"""
Defines an Author from ``Good Reads``.
Connects to https://www.goodreads.com/ to extract quotes and books from famous authors.
"""
import langdetect
from scrapereads.utils import name_to_goodreads, process_quote_text, serialize_dict
from scrapereads import scrape
from scrapereads.meta import AuthorMeta
import scrapereads.reads as greads
class Author(AuthorMeta):
"""
    Defines an author from the page info on ``https://www.goodreads.com/``.
* :attr:`name`: name of the author.
* :attr:`key`: key id of the author.
* :attr:`url`: url page of the author.
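
    Example:
        Illustrative construction (the author id and name below are made up;
        the name is later converted to the Goodreads URL format by
        ``name_to_goodreads`` when connecting)::

            author = Author(3389, author_name='Stephen King')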
"""
def __init__(self, author_id, author_name=None):
super().__init__(author_id, author_name=author_name)
self._quotes = []
self._books = []
self._info = None
    @classmethod
def from_url(cls, url):
"""Construct the class from an url.
Args:
            url (string): URL of the author page.
Returns:
Author
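
        Example:
            A minimal, illustrative call (the URL below is a made-up Goodreads
            author page; any ``/author/show/<id>.<Name>`` URL follows the same
            pattern)::

                author = Author.from_url(
                    'https://www.goodreads.com/author/show/3389.Stephen_King')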
"""
        author_id = int(url.split('/')[-1].split('.')[0])
        author_name = url.split('/')[-1].split('.')[1]
        return cls(author_id, author_name=author_name)
    def get_info(self):
"""Get author information (genres, influences, description etc.)
Returns:
dict
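
        Example:
            Illustrative usage, given an ``Author`` instance ``author`` (the
            returned keys depend on what the scraped page contains)::

                info = author.get_info()
                print(info)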
"""
if not self._info:
soup = self._soup or self.connect()
self._info = scrape.get_author_info(soup)
return self._info
    def add_quote(self, quote):
"""Add a quote to an Author.
Args:
            quote (Quote): quote to add.
"""
quote.author_name = self.author_name
quote.author_id = self.author_id
quote.register_author(self)
self._quotes.append(quote)
    def add_book(self, book):
"""Add a book to an Author.
Args:
            book (Book): book to add.
"""
book.author_name = self.author_name
book.author_id = self.author_id
book.register_author(self)
self._books.append(book)
def _search_books(self):
        # Scrape books from the author's book page on goodreads.com
self._books = []
npage = 1
href = f'/author/list/{self.author_id}.{name_to_goodreads(self.author_name)}'
search = True
while search:
            # Assume no books were found until this page yields some
search = False
# Navigate through the next page
href_page = href + self._next_page(npage=npage)
soup = self.connect(href=href_page)
npage += 1
for book_tr in scrape.scrape_author_books(soup):
# Books found
search = True
book_title = scrape.get_author_book_title(book_tr)
book_href = book_title.get('href')
book_id = book_href.split('/')[-1].split('-')[0].split('.')[0]
book_name = book_title.text.strip().title()
ratings = scrape.get_author_book_ratings(book_tr).contents[-1]
edition = scrape.get_author_book_edition(book_tr)
edition = edition.text.strip() if edition else None
year = scrape.get_author_book_date(book_tr)
book = greads.Book(self.author_id, book_id, book_name=book_name,
author_name=self.author_name, edition=edition, year=year, ratings=ratings)
self.add_book(book)
yield book
def _search_quotes(self):
        # Scrape quotes from the author's quote page on goodreads.com
self._quotes = []
npage = 1
href = f'/author/quotes/{self.author_id}.{name_to_goodreads(self.author_name)}'
search = True
while search:
            # Assume no quotes were found until this page yields some
search = False
# Navigate through the next page
href_page = href + self._next_page(npage=npage)
soup = self.connect(href=href_page)
npage += 1
for quote_div in scrape.scrape_quotes(soup):
# Quotes found
search = True
quote_text = process_quote_text(scrape.get_quote_text(quote_div))
                # Parse the like count safely (avoid eval on scraped text)
                quote_likes = int(scrape.get_quote_likes(quote_div).text.replace('likes', '').replace(',', '').strip())
quote_href = scrape.get_quote_likes(quote_div).get('href')
quote_id = quote_href.split('-')[0].split('.')[0]
quote_tags = []
for tag in scrape.scrape_quote_tags(quote_div):
quote_tags.append(tag.text.strip())
quote = greads.Quote(self.author_id,
quote_id,
text=quote_text,
author_name=self.author_name,
tags=quote_tags,
likes=quote_likes)
# Register the quote to a book if it exists
book_title = scrape.get_quote_book(quote_div)
# The quote is linked to a book
if book_title:
book_href = book_title.get('href')
book_id = book_href.split('/')[-1].split('-')[0].split('.')[0]
book_name = book_title.text.strip()
                    # Reuse the book if it is already saved in the cache; otherwise create it and add it.
                    # Note: calling ``search_book()`` on an empty cache would scrape ALL of the
                    # author's books, which is time consuming, so only the cache is queried here.
                    book_exists = book_id in [book.book_id for book in self._books]
                    if book_exists:
book = self.search_book(book_id)
else:
book = greads.Book(self.author_id, book_id, book_name=book_name, author_name=self.author_name)
self.add_book(book)
book.add_quote(quote)
# Add the quote and return it
self.add_quote(quote)
yield quote
    def quotes(self, cache=True):
"""Yield all quotes from an author address.
This function extract online data from `Good Reads` if nothing is already saved in the cache.
Args:
cache (bool): if ``True``, will look for cache items only (and won't scrape online).
Returns:
yield Quote
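
        Example:
            Iterate lazily over the author's quotes (this scrapes Goodreads on
            first use; the ``text`` attribute mirrors the constructor keyword)::

                for quote in author.quotes():
                    print(quote.text)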
"""
if len(self._quotes) > 0 and cache:
yield from self._quotes
else:
yield from self._search_quotes()
# TODO: merge this function with Book.get_quotes()
    def get_quotes(self, lang=None, top_k=None, cache=True):
"""Get all quotes from an author address.
Args:
            lang (string): language code used to filter quotes (as returned by ``langdetect``).
top_k (int): number of quotes to retrieve (ordered by popularity).
cache (bool): if ``True``, will look for cache items only (and won't scrape online).
Returns:
list(Quote)
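
        Example:
            Illustrative call fetching the five most popular English quotes
            (language codes follow ``langdetect``, e.g. ``'en'``)::

                quotes = author.get_quotes(lang='en', top_k=5)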
"""
        # Reset the cached quotes if there are fewer than ``top_k`` of them
if top_k and len(self._quotes) < top_k:
self._quotes = []
# Get the top-k quotes, ordered from the author's quote page (usually it's ordered by popularity)
quotes = []
for i, quote in enumerate(self.quotes(cache=cache)):
if not lang or langdetect.detect(quote.text) == lang:
quote.register_author(self)
quotes.append(quote)
if top_k and i + 1 >= top_k:
break
return quotes
    def books(self, cache=True):
"""Get all books from an author address.
This function extract online data from `Good Reads` if nothing is already saved in the cache.
Args:
cache (bool): if ``True``, will look for cache items only (and won't scrape online).
Returns:
            yield Book
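
        Example:
            Iterate lazily over the author's books (this scrapes Goodreads on
            first use; ``book_name`` mirrors the constructor keyword)::

                for book in author.books():
                    print(book.book_name)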
"""
if len(self._books) > 0 and cache:
yield from self._books
else:
yield from self._search_books()
    def get_books(self, top_k=None, cache=True):
"""Get all books from an author address.
Args:
top_k (int): number of books to return.
cache (bool): if ``True``, will look for cache items only (and won't scrape online).
Returns:
list(Book)
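
        Example:
            Illustrative call returning the first ten listed books::

                books = author.get_books(top_k=10)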
"""
        # Reset the cached books if there are fewer than ``top_k`` of them
if top_k and len(self._books) < top_k:
self._books = []
# Get the top-k books, ordered from the author's book page
books = []
for i, book in enumerate(self.books(cache=cache)):
book.register_author(self)
books.append(book)
if top_k and i + 1 >= top_k:
break
return books
# TODO: use for loop with yield
    def search_book(self, book_id, attr='book_id', cache=True):
"""Search a book from the books saved in the author's cache.
Args:
book_id (string): book id (or name) to look for.
            attr (string, optional): attribute to search the book from. Options are ``'book_id'`` and ``'book_name'``.
cache (bool): if ``True``, will look for cache items only (and won't scrape online).
Returns:
Book
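
        Example:
            Search by name instead of id (the title below is hypothetical;
            scraped book names are title-cased)::

                book = author.search_book('The Stand', attr='book_name')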
"""
for book in self.books(cache=cache):
if str(book_id) == str(getattr(book, attr)):
book.register_author(self)
return book
# TODO: use for loop with yield
    def search_quote(self, quote_id, attr='quote_id', cache=True):
"""Search a quote from the books saved in the author's cache.
Args:
quote_id (string): quote'id to look for.
attr (string, optional): attribute to search the quote from. Options are ``'quote_id'`` and ``'quote_name'``
cache (bool): if ``True``, will look for cache items only (and won't scrape online).
Returns:
            Quote
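
        Example:
            Illustrative lookup by id (the id below is made up)::

                quote = author.search_quote('12345')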
"""
for quote in self.quotes(cache=cache):
if str(quote_id) == str(getattr(quote, attr)):
quote.register_author(self)
return quote
    def get_similar_authors(self, top_k=None):
"""Get similar artists from the author.
Args:
top_k (int): number of authors to retrieve (ordered by popularity).
Returns:
list(Author)
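
        Example:
            Fetch a few similar authors::

                for other in author.get_similar_authors(top_k=3):
                    print(other.author_name)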
"""
href = f'/author/similar/{self.author_id}.{name_to_goodreads(self.author_name)}'
soup = self.connect(href=href)
authors = []
        authors_container = soup.find_all('a', attrs={'class': 'gr-h3 gr-h3--serif gr-h3--noMargin'})
        # Skip the first link, which appears to be the current author's own page
        for i, author in enumerate(authors_container[1:]):
url_author = author.attrs['href']
authors.append(Author.from_url(url_author))
if top_k and i + 1 >= top_k:
break
return authors
# TODO: finish and add nested JSON option
    def to_json(self, encode=None):
"""Encode the author to a JSON format.
Args:
encode (string): encode to ASCII format or not.
Returns:
dict
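
        Example:
            Dump the author to JSON with the standard library::

                import json

                print(json.dumps(author.to_json(), indent=2))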
"""
data = {
'author': self.author_name,
**self.get_info()
}
if encode:
return serialize_dict(data)
return data