Source code for scrapereads.reads.author

"""
Defines an Author from ``Good Reads``.
Connect to https://www.goodreads.com/ to extract quotes and books from famous authors.
"""

import warnings
import langdetect

from scrapereads.utils import *
from scrapereads import scrape
from scrapereads.meta import AuthorMeta
import scrapereads.reads as greads


class Author(AuthorMeta):
    """
    Defines an author, built from the author page info on ``https://www.goodreads.com/``.

    * :attr:`name`: name of the author.
    * :attr:`key`: key id of the author.
    * :attr:`url`: url page of the author.

    """

    def __init__(self, author_id, author_name=None):
        super().__init__(author_id, author_name=author_name)
        self._quotes = []
        self._books = []
        self._info = None

    @classmethod
    def from_url(cls, url):
        """Construct the class from a URL.

        Args:
            url (string): URL of an author page, of the form
                ``https://www.goodreads.com/author/show/<id>.<name>``.

        Returns:
            Author

        """
        # Parse the trailing '<id>.<name>' part of the URL.
        # ``partition`` (instead of ``eval``) keeps the id as a plain string
        # and avoids executing arbitrary URL content.
        last_part = url.split('/')[-1]
        author_id, _, author_name = last_part.partition('.')
        return cls(author_id, author_name=author_name)
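
    # Minimal usage sketch (hypothetical author page; assumes the standard
    # Goodreads URL layout shown in the docstring above):
    #
    #   author = Author.from_url('https://www.goodreads.com/author/show/3389.Stephen_King')
    #   author.author_id, author.author_name  # -> ('3389', 'Stephen_King')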

    def get_info(self):
        """Get author information (genres, influences, description, etc.).

        Returns:
            dict

        """
        if not self._info:
            soup = self._soup or self.connect()
            self._info = scrape.get_author_info(soup)
        return self._info
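
    # Minimal usage sketch (requires network access; the exact keys of the
    # returned dict depend on what ``scrape.get_author_info()`` finds on the
    # live page, so treat them as an assumption):
    #
    #   author = Author(3389, author_name='Stephen King')
    #   info = author.get_info()
    #   info.get('Genre')  # hypothetical key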

    def add_quote(self, quote):
        """Add a quote to an Author.

        Args:
            quote (Quote): quote to add.

        """
        quote.author_name = self.author_name
        quote.author_id = self.author_id
        quote.register_author(self)
        self._quotes.append(quote)

    def add_book(self, book):
        """Add a book to an Author.

        Args:
            book (Book): book to add.

        """
        book.author_name = self.author_name
        book.author_id = self.author_id
        book.register_author(self)
        self._books.append(book)

    def _search_books(self):
        # Scrape books from the author's book page on goodreads.com
        self._books = []
        npage = 1
        href = f'/author/list/{self.author_id}.{name_to_goodreads(self.author_name)}'
        search = True
        while search:
            # Assume nothing was found until the page yields books
            search = False
            # Navigate through the next page
            href_page = href + self._next_page(npage=npage)
            soup = self.connect(href=href_page)
            npage += 1
            for book_tr in scrape.scrape_author_books(soup):
                # Books found
                search = True
                book_title = scrape.get_author_book_title(book_tr)
                book_href = book_title.get('href')
                book_id = book_href.split('/')[-1].split('-')[0].split('.')[0]
                book_name = book_title.text.strip().title()
                ratings = scrape.get_author_book_ratings(book_tr).contents[-1]
                edition = scrape.get_author_book_edition(book_tr)
                edition = edition.text.strip() if edition else None
                year = scrape.get_author_book_date(book_tr)
                book = greads.Book(self.author_id, book_id, book_name=book_name,
                                   author_name=self.author_name, edition=edition,
                                   year=year, ratings=ratings)
                self.add_book(book)
                yield book

    def _search_quotes(self):
        # Scrape quotes from the author's quote page on goodreads.com
        self._quotes = []
        npage = 1
        href = f'/author/quotes/{self.author_id}.{name_to_goodreads(self.author_name)}'
        search = True
        while search:
            # Assume nothing was found until the page yields quotes
            search = False
            # Navigate through the next page
            href_page = href + self._next_page(npage=npage)
            soup = self.connect(href=href_page)
            npage += 1
            for quote_div in scrape.scrape_quotes(soup):
                # Quotes found
                search = True
                quote_text = process_quote_text(scrape.get_quote_text(quote_div))
                likes_anchor = scrape.get_quote_likes(quote_div)
                # ``int`` (instead of ``eval``) safely parses the likes count
                quote_likes = int(likes_anchor.text.replace('likes', '').replace(',', '').strip())
                quote_href = likes_anchor.get('href')
                quote_id = quote_href.split('-')[0].split('.')[0]
                quote_tags = [tag.text.strip() for tag in scrape.scrape_quote_tags(quote_div)]
                quote = greads.Quote(self.author_id, quote_id, text=quote_text,
                                     author_name=self.author_name, tags=quote_tags,
                                     likes=quote_likes)
                # Register the quote to a book if one is linked
                book_title = scrape.get_quote_book(quote_div)
                if book_title:
                    book_href = book_title.get('href')
                    book_id = book_href.split('/')[-1].split('-')[0].split('.')[0]
                    book_name = book_title.text.strip()
                    # Look for an already cached book, and create it if missing.
                    # Calling ``search_book()`` without any cached books would
                    # scrape ALL of the author's books, which is time consuming;
                    # so only the cache is checked here.
                    if book_id in [book.book_id for book in self._books]:
                        book = self.search_book(book_id)
                    else:
                        book = greads.Book(self.author_id, book_id, book_name=book_name,
                                           author_name=self.author_name)
                        self.add_book(book)
                    book.add_quote(quote)
                # Add the quote and return it
                self.add_quote(quote)
                yield quote
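
    # Both generators above follow the same pagination pattern: request page
    # 1, 2, ... and stop at the first page that yields no rows. A minimal
    # standalone sketch of the idea (``fetch_page`` and ``parse_rows`` are
    # illustrative placeholders, not part of this module):
    #
    #   def paginate(fetch_page, parse_rows):
    #       npage = 1
    #       while True:
    #           rows = list(parse_rows(fetch_page(npage)))
    #           if not rows:
    #               break
    #           yield from rows
    #           npage += 1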

    def quotes(self, cache=True):
        """Yield all quotes from an author.

        This function extracts online data from `Goodreads` if nothing is
        already saved in the cache.

        Args:
            cache (bool): if ``True``, yield cached quotes when available
                (and don't scrape online).

        Returns:
            yield Quote

        """
        if len(self._quotes) > 0 and cache:
            yield from self._quotes
        else:
            yield from self._search_quotes()

    # TODO: merge this function with Book.get_quotes()

    def get_quotes(self, lang=None, top_k=None, cache=True):
        """Get all quotes from an author.

        Args:
            lang (string): language of the quotes to pick up.
            top_k (int): number of quotes to retrieve (ordered by popularity).
            cache (bool): if ``True``, will look for cached items first
                (and won't scrape online if any are found).

        Returns:
            list(Quote)

        """
        # Reset the cached quotes if there are fewer of them than the threshold
        if top_k and len(self._quotes) < top_k:
            self._quotes = []
        # Get the top-k quotes, in the order of the author's quote page
        # (usually ordered by popularity)
        quotes = []
        for i, quote in enumerate(self.quotes(cache=cache)):
            if not lang or langdetect.detect(quote.text) == lang:
                quote.register_author(self)
                quotes.append(quote)
            if top_k and i + 1 >= top_k:
                break
        return quotes
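
    # Minimal usage sketch (hypothetical author; 'en' is a langdetect
    # ISO 639-1 language code):
    #
    #   author = Author(3389, author_name='Stephen King')
    #   for quote in author.get_quotes(lang='en', top_k=5):
    #       print(quote.text)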

    def books(self, cache=True):
        """Yield all books from an author.

        This function extracts online data from `Goodreads` if nothing is
        already saved in the cache.

        Args:
            cache (bool): if ``True``, yield cached books when available
                (and don't scrape online).

        Returns:
            yield Book

        """
        if len(self._books) > 0 and cache:
            yield from self._books
        else:
            yield from self._search_books()

    def get_books(self, top_k=None, cache=True):
        """Get all books from an author.

        Args:
            top_k (int): number of books to return.
            cache (bool): if ``True``, will look for cached items first
                (and won't scrape online if any are found).

        Returns:
            list(Book)

        """
        # Reset the cached books if there are fewer of them than the threshold
        if top_k and len(self._books) < top_k:
            self._books = []
        # Get the top-k books, in the order of the author's book page
        books = []
        for i, book in enumerate(self.books(cache=cache)):
            book.register_author(self)
            books.append(book)
            if top_k and i + 1 >= top_k:
                break
        return books
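
    # Minimal usage sketch (hypothetical author; ``book_name`` is the
    # attribute implied by ``search_book()`` below):
    #
    #   author = Author(3389, author_name='Stephen King')
    #   for book in author.get_books(top_k=10):
    #       print(book.book_name)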

    # TODO: use for loop with yield

    def search_book(self, book_id, attr='book_id', cache=True):
        """Search a book among the books saved in the author's cache.

        Args:
            book_id (string): book id (or name) to look for.
            attr (string, optional): attribute to search the book from.
                Options are ``'book_id'`` and ``'book_name'``.
            cache (bool): if ``True``, will look for cached items first
                (and won't scrape online if any are found).

        Returns:
            Book

        """
        for book in self.books(cache=cache):
            if str(book_id) == str(getattr(book, attr)):
                book.register_author(self)
                return book
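
    # Minimal usage sketch (hypothetical title; note that the comparison is
    # an exact string match and ``_search_books()`` title-cases book names):
    #
    #   book = author.search_book('The Shining', attr='book_name')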

    # TODO: use for loop with yield

    def search_quote(self, quote_id, attr='quote_id', cache=True):
        """Search a quote among the quotes saved in the author's cache.

        Args:
            quote_id (string): quote id to look for.
            attr (string, optional): attribute to search the quote from.
                Options are ``'quote_id'`` and ``'quote_name'``.
            cache (bool): if ``True``, will look for cached items first
                (and won't scrape online if any are found).

        Returns:
            Quote

        """
        for quote in self.quotes(cache=cache):
            if str(quote_id) == str(getattr(quote, attr)):
                quote.register_author(self)
                return quote

    def get_similar_authors(self, top_k=None):
        """Get authors similar to the author.

        Args:
            top_k (int): number of authors to retrieve (ordered by popularity).

        Returns:
            list(Author)

        """
        href = f'/author/similar/{self.author_id}.{name_to_goodreads(self.author_name)}'
        soup = self.connect(href=href)
        authors = []
        authors_container = soup.findAll('a', attrs={'class': 'gr-h3 gr-h3--serif gr-h3--noMargin'})
        # The first link points back to the author, so skip it
        for i, author in enumerate(authors_container[1:]):
            url_author = author.attrs['href']
            authors.append(Author.from_url(url_author))
            if top_k and i + 1 >= top_k:
                break
        return authors
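
    # Minimal usage sketch (requires network access; results depend on the
    # live 'similar authors' page):
    #
    #   similar = author.get_similar_authors(top_k=3)
    #   [a.author_name for a in similar]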

    # TODO: finish and add nested JSON option

    def to_json(self, encode=None):
        """Encode the author to a JSON format.

        Args:
            encode (string): if set, encode the values to ASCII format.

        Returns:
            dict

        """
        data = {
            'author': self.author_name,
            **self.get_info()
        }
        if encode:
            return serialize_dict(data)
        return data
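
# Minimal usage sketch (``serialize_dict`` comes from ``scrapereads.utils``;
# the dict keys beyond 'author' depend on ``get_info()``):
#
#   import json
#   author = Author(3389, author_name='Stephen King')
#   print(json.dumps(author.to_json(), indent=2, default=str))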