Source code for scrapereads.reads.book

"""
Defines a book from an Author.
"""

import warnings
import langdetect

from scrapereads.utils import *
from scrapereads import scrape
from scrapereads.meta import BookMeta
import scrapereads.reads as greads


class Book(BookMeta):
    """A book from an author, together with the quotes scraped for it.

    Quotes are fetched lazily from the book's quote pages and stored in an
    in-memory cache (``self._quotes``) so that later calls can reuse them.

    Args:
        author_id: author key, forwarded to ``BookMeta``.
        book_id: book key, forwarded to ``BookMeta``.
        book_name: optional book name, forwarded to ``BookMeta``.
        author_name: optional author name, forwarded to ``BookMeta``.
        edition: optional edition, forwarded to ``BookMeta``.
        year: optional year, forwarded to ``BookMeta``.
        ratings: optional ratings value, stored as-is on ``self.ratings``.
    """

    def __init__(self, author_id, book_id, book_name=None, author_name=None,
                 edition=None, year=None, ratings=None):
        super().__init__(author_id, book_id, book_name=book_name, author_name=author_name,
                         edition=edition, year=year)
        self.ratings = ratings
        # Cache of the quotes registered through ``add_quote``.
        self._quotes = []

    @staticmethod
    def _parse_likes(text):
        """Extract the like count from a label such as ``'123 likes'``.

        Only the decimal digits of ``text`` are kept, so thousands separators
        and singular/plural suffixes are ignored.

        Args:
            text (str): raw label text.

        Returns:
            int: the parsed count, or ``0`` when ``text`` holds no digit.
        """
        digits = ''.join(ch for ch in text if ch.isdecimal())
        return int(digits) if digits else 0

    @staticmethod
    def _matches_lang(text, lang):
        """Tell whether ``text`` is detected as being written in ``lang``.

        Args:
            text (str): text to inspect.
            lang (str): expected language code; a falsy value matches anything.

        Returns:
            bool: ``True`` when no language is requested or when the detected
            language equals ``lang``.
        """
        if not lang:
            return True
        try:
            return langdetect.detect(text) == lang
        except langdetect.LangDetectException:
            # Raised when the text has no usable features (e.g. empty text);
            # such a quote cannot match a requested language.
            return False

    def _search_quotes(self):
        """Scrape the book's quotes online, page by page.

        The cache is cleared first; every quote found is registered through
        ``add_quote`` (and therefore cached) before being yielded. Pagination
        stops at the first page that yields no quote.

        Yields:
            Quote
        """
        # Scrape online quotes from goodreads.com
        self._quotes = []
        soup = self.connect()
        href_a = scrape.get_book_quote_page(soup)
        if not href_a:
            # The book page exposes no link to a quote page.
            return

        href = href_a.get('href')
        npage = 1
        search = True
        while search:
            # Assume the page is empty until a quote is found on it.
            search = False
            # Navigate through the next page
            href_page = href + self._next_page(npage=npage)
            soup = self.connect(href=href_page)
            npage += 1
            for quote_div in scrape.scrape_quotes(soup):
                # Quotes found: keep paginating.
                search = True
                quote_text = process_quote_text(scrape.get_quote_text(quote_div))
                likes_a = scrape.get_quote_likes(quote_div)
                # The label is scraped (untrusted) text: parse it as an integer
                # rather than evaluating it as Python code.
                quote_likes = self._parse_likes(likes_a.text)
                quote_href = likes_a.get('href')
                # NOTE(review): keeps whatever precedes the first '-' and the
                # first '.' of the href -- confirm this is the bare quote id.
                quote_id = quote_href.split('-')[0].split('.')[0]
                quote_tags = [tag.text.strip() for tag in scrape.scrape_quote_tags(quote_div)]
                quote = greads.Quote(self.author_id, quote_id, text=quote_text,
                                     author_name=self.author_name,
                                     tags=quote_tags, likes=quote_likes)
                self.add_quote(quote)
                yield quote

    def quotes(self, cache=True):
        """Yield all quotes from a book address.

        Cached quotes are yielded when ``cache`` is ``True`` and the cache is
        not empty. Otherwise the quotes are scraped online from `Good Reads`
        (which also rebuilds the cache).

        Args:
            cache (bool): if ``True``, reuse the cached quotes when there are any;
                if ``False``, always scrape online.

        Yields:
            Quote
        """
        if cache and self._quotes:
            yield from self._quotes
        else:
            yield from self._search_quotes()

    def get_quotes(self, lang=None, top_k=None, cache=True):
        """Get all quotes from a book address.

        Args:
            lang (string): language to pick up quotes. Quotes whose language cannot
                be detected are skipped when a language is requested.
            top_k (int): number of quotes to go through, in the order of the book's quote
                page (usually ordered by popularity). The limit applies before the
                language filter, so fewer than ``top_k`` quotes may be returned.
            cache (bool): if ``True``, reuse the cached quotes when there are enough of them;
                an empty or too-short cache triggers online scraping.

        Returns:
            list(Quote)
        """
        # Reset the quotes saved in the cache if its length is under the threshold
        if top_k and len(self._quotes) < top_k:
            self._quotes = []
        # Get the top-k quotes, ordered from the book's quote page (usually it's ordered by popularity)
        quotes = []
        for i, quote in enumerate(self.quotes(cache=cache)):
            if self._matches_lang(quote.text, lang):
                quote.register_book(self)
                quotes.append(quote)
            if top_k and i + 1 >= top_k:
                break
        return quotes

    def add_quote(self, quote):
        """Add a quote to the Book, that will be saved in the cache.

        The quote's author name and id are overwritten with the book's ones, and
        the quote is linked to both the book's author and the book itself.

        Args:
            quote (Quote): quote to add.
        """
        quote.author_name = self.author_name
        quote.author_id = self.author_id
        quote.register_author(self.get_author())
        quote.register_book(self)
        self._quotes.append(quote)

    # TODO: add nested JSON option
    def to_json(self, encode='ascii'):
        """Encode the book to a JSON format.

        The book's quotes are obtained through ``quotes()``, which scrapes them
        online when the cache is empty.

        Args:
            encode (str): encoding forwarded to each quote's ``to_json``. When truthy,
                the resulting dictionary is also passed through ``serialize_dict``.

        Returns:
            dict
        """
        data = {
            'author': self.author_name,
            'book': self.book_name,
            'edition': self.edition,
            'year': self.year,
            'quotes': [quote.to_json(encode=encode) for quote in self.quotes()],
        }
        if encode:
            return serialize_dict(data)
        return data