Source code for scrapereads.scrape

"""
Scrape quotes, books and authors from the ``Goodreads`` website.
"""

import bs4
from .utils import *


def get_author_name(soup):
    """Get the author's name from its main page.

    Args:
        soup (bs4.element.Tag): connection to the author page.

    Returns:
        string: name of the author.

    Examples::

        >>> from scrapereads import connect
        >>> url = 'https://www.goodreads.com/author/show/1077326'
        >>> soup = connect(url)
        >>> get_author_name(soup)
        J.K. Rowling

    """
    author_h1 = soup.find('h1', attrs={'class': 'authorName'})
    return author_h1.find('span').text
def get_author_desc(soup):
    """Get the author description / biography.

    Args:
        soup (bs4.element.Tag): connection to the author page.

    Returns:
        str: long description of the author.

    Examples::

        >>> from scrapereads import connect
        >>> url = 'https://www.goodreads.com/author/show/1077326'
        >>> soup = connect(url)
        >>> get_author_desc(soup)
        See also: Robert Galbraith
        Although she writes under the pen name J.K. Rowling, pronounced like rolling,
        her name when her first Harry Potter book was published was simply Joanne Rowling.
        ...

    """
    author_info_desc = soup.find('div', attrs={'class': 'aboutAuthorInfo'})
    author_info_long = author_info_desc.findAll('span')[-1]
    long_desc = ""
    for sentence in author_info_long.children:
        if isinstance(sentence, bs4.element.Tag):
            if sentence.name == 'br':
                long_desc += '\n'
            else:
                long_desc += sentence.text
        else:
            long_desc += sentence
    long_desc = long_desc.replace('’', "'")
    return long_desc
def get_author_info(soup):
    """Get all information from an author (genres, influences, website etc.).

    Args:
        soup (bs4.element.Tag): author page connection.

    Returns:
        dict
    """
    container = soup.find('div', attrs={'class': 'rightContainer'})
    author_info = {}
    data_div = container.find('br', attrs={'class': 'clear'})
    while data_div:
        if data_div.name:
            data_class = data_div.get('class')[0]
            # Information section is finished
            if data_class == 'aboutAuthorInfo':
                break
            # Key elements
            elif data_class == 'dataTitle':
                key = data_div.text.strip()
                author_info[key] = []
                # Born section
                if data_div.text == 'Born':
                    data_div = data_div.next_sibling
                    author_info[key].append(data_div.strip())
                # Influences section
                elif data_div.text == 'Influences':
                    data_div = data_div.next_sibling.next_sibling
                    data_items = data_div.findAll('span')[-1].findAll('a')
                    for data_a in data_items:
                        author_info[key].append(data_a.text.strip())
                # Member since section
                elif data_div.text == 'Member Since':
                    data_div = data_div.next_sibling.next_sibling
                    author_info[key].append(data_div.text.strip())
            # Genre, website and other sections
            else:
                data_items = data_div.findAll('a')
                for data_a in data_items:
                    author_info[key].append(data_a.text.strip())
        data_div = data_div.next_sibling
    author_info.update({'Description': get_author_desc(soup)})
    return author_info
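A short usage sketch (not part of the module), reusing the ``connect`` helper and author URL from the docstring examples above; the keys returned (e.g. ``'Born'``, ``'Influences'``, ``'Genre'``, ``'Description'``) depend on what Goodreads lists for that author.

    >>> from scrapereads import connect
    >>> url = 'https://www.goodreads.com/author/show/1077326'
    >>> soup = connect(url)
    >>> info = get_author_info(soup)
    >>> for key, values in info.items():
    ...     print(key, values)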
def scrape_quotes_container(soup):
    """Get the quote containers from a quote page.

    Args:
        soup (bs4.element.Tag): connection to the quote page.

    Returns:
        bs4.element.ResultSet: ``<div>`` quote containers.
    """
    return soup.findAll('div', attrs={'class': 'quotes'})
def scrape_quotes(soup):
    """Retrieve all ``<div>`` quote elements from a quote page.

    Args:
        soup (bs4.element.Tag): connection to the quote page.

    Returns:
        yield bs4.element.Tag
    """
    for container_div in scrape_quotes_container(soup):
        quote_div = container_div.find('div', attrs={'class': 'quote'})
        while quote_div:
            if quote_div.name == 'div' and quote_div.get('class') and 'quote' in quote_div.get('class'):
                yield quote_div
            quote_div = quote_div.next_sibling
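A minimal sketch of the two functions above, assuming ``url`` points to a Goodreads quotes page opened with ``connect`` (the exact URL pattern is not defined in this module):

    >>> from scrapereads import connect
    >>> soup = connect(url)  # ``url``: an author or book quotes page (assumed)
    >>> for quote_div in scrape_quotes(soup):
    ...     print(quote_div.get('class'))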
def get_quote_text(quote_div):
    """Get the text from a ``<div>`` quote element.

    Args:
        quote_div (bs4.element.Tag): ``<div>`` quote element to extract the text.

    Returns:
        string
    """
    quote_text = ''
    text_iterator = quote_div.find('div', attrs={'class': 'quoteText'}).children
    for text in text_iterator:
        if text.name == 'br':
            quote_text += '\n'
        elif not text.name:
            quote_text += text.strip()
    quote_text = process_quote_text(quote_text)
    return quote_text
def scrape_quote_tags(quote_div):
    """Scrape tags from a ``<div>`` quote element.

    Args:
        quote_div (bs4.element.Tag): ``<div>`` quote element from a quote page.

    Returns:
        yield ``<a>`` tags
    """
    tags_container = quote_div.find('div', attrs={'class': 'greyText smallText left'})
    if tags_container:
        for tag in tags_container.children:
            if tag.name == 'a':
                yield tag
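The two helpers above are typically combined per quote element; a hedged sketch, assuming ``soup`` is a quotes page already opened with ``connect``:

    >>> for quote_div in scrape_quotes(soup):
    ...     text = get_quote_text(quote_div)
    ...     tags = [tag.text for tag in scrape_quote_tags(quote_div)]
    ...     print(text, tags)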
def get_quote_book(quote_div):
    """Get the reference (book) from a ``<div>`` quote element.

    Args:
        quote_div (bs4.element.Tag): ``<div>`` quote element from a quote page.

    Returns:
        bs4.element.Tag
    """
    quote_details = quote_div.find('div', attrs={'class': 'quoteText'})
    return quote_details.find('a', attrs={'class': 'authorOrTitle'})
def get_quote_author_name(quote_div):
    """Get the author's name from a ``<div>`` quote element.

    Args:
        quote_div (bs4.element.Tag): ``<div>`` quote element from a quote page.

    Returns:
        string
    """
    quote_text = quote_div.find('div', attrs={'class': 'quoteText '})
    author_name = quote_text.find('span', attrs={'class': 'authorOrTitle'}).text
    return remove_punctuation(author_name).title()
def get_quote_likes(quote_div):
    """Get the likes ``<a>`` tag from a ``<div>`` quote element.

    Args:
        quote_div (bs4.element.Tag): ``<div>`` quote element from a quote page.

    Returns:
        bs4.element.Tag: ``<a>`` tag for likes.
    """
    quote_footer = quote_div.find('div', attrs={'class': 'quoteFooter'})
    return quote_footer.find('a', attrs={'class': 'smallText'})
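A sketch combining the three accessors above on each quote element. Guarding ``get_quote_book`` against ``None`` is an assumption of this example (some quotes may carry no book reference), not something this module asserts:

    >>> for quote_div in scrape_quotes(soup):
    ...     author = get_quote_author_name(quote_div)
    ...     book = get_quote_book(quote_div)
    ...     likes = get_quote_likes(quote_div)
    ...     print(author, book.text.strip() if book else None, likes.text.strip())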
# TODO: deprecate this
def get_quote_name_id(quote_div):
    """Get the name and id of a ``<div>`` quote element.

    Args:
        quote_div (bs4.element.Tag): ``<div>`` quote element from a quote page.

    Returns:
        tuple: id and name.
    """
    quote_href = get_quote_likes(quote_div).get('href')
    quote_id = quote_href.split('/')[-1].split('-')[0]
    quote_name = '-'.join(quote_href.split('/')[-1].split('-')[1:])
    return quote_id, quote_name
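A minimal sketch of the split performed above; it relies on the likes link's ``href`` ending in an ``<id>-<slug>`` segment, which is what the string splitting assumes:

    >>> for quote_div in scrape_quotes(soup):
    ...     quote_id, quote_name = get_quote_name_id(quote_div)
    ...     print(quote_id, quote_name)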
def scrape_author_books(soup):
    """Retrieve books from an author's page.

    Args:
        soup (bs4.element.Tag): connection to an author books page.

    Returns:
        yield bs4.element.Tag: ``<tr>`` element.
    """
    table_tr = soup.find('tr')
    while table_tr:
        if table_tr.name == 'tr':
            yield table_tr
        table_tr = table_tr.next_sibling
def get_author_book_title(book_tr):
    """Get the book title ``<a>`` element from a table ``<tr>`` element from an author page.

    Args:
        book_tr (bs4.element.Tag): ``<tr>`` book element.

    Returns:
        bs4.element.Tag: book title ``<a>`` element.

    Examples::

        >>> for book_tr in scrape_author_books(soup):
        ...     book_title = get_author_book_title(book_tr)
        ...     print(book_title.text.strip(), book_title.get('href'))
        The Bell Jar /book/show/6514.The_Bell_Jar
        Ariel /book/show/395090.Ariel
        The Collected Poems /book/show/31426.The_Collected_Poems
        The Unabridged Journals of Sylvia Plath /book/show/11623.The_Unabridged_Journals_of_Sylvia_Plath

    """
    return book_tr.find('a', attrs={'class': 'bookTitle'})
def get_author_book_author(book_tr):
    """Get the author ``<a>`` element from a table ``<tr>`` element.

    Args:
        book_tr (bs4.element.Tag): ``<tr>`` book element.

    Returns:
        bs4.element.Tag: author name ``<a>`` element.

    Examples::

        >>> for book_tr in scrape_author_books(soup):
        ...     book_author = get_author_book_author(book_tr)
        ...     print(book_author.text, book_author.get('href'))
        Sylvia Plath https://www.goodreads.com/author/show/4379.Sylvia_Plath
        Sylvia Plath https://www.goodreads.com/author/show/4379.Sylvia_Plath
        Sylvia Plath https://www.goodreads.com/author/show/4379.Sylvia_Plath
        Sylvia Plath https://www.goodreads.com/author/show/4379.Sylvia_Plath
        Sylvia Plath https://www.goodreads.com/author/show/4379.Sylvia_Plath

    """
    return book_tr.find('a', attrs={'class': 'authorName'})
def get_author_book_ratings(book_tr):
    """Get the ratings ``<span>`` element from a table ``<tr>`` element from an author page.

    Args:
        book_tr (bs4.element.Tag): ``<tr>`` book element.

    Returns:
        bs4.element.Tag: ratings ``<span>`` element.

    Examples::

        >>> for book_tr in scrape_author_books(soup):
        ...     ratings_span = get_author_book_ratings(book_tr)
        ...     print(ratings_span.contents[-1])
        4.55 avg rating — 2,414 ratings
        3.77 avg rating — 1,689 ratings
        4.28 avg rating — 892 ratings
        4.54 avg rating — 490 ratings
        ...

    """
    return book_tr.find('span', attrs={'class': 'minirating'})
def get_author_book_edition(book_tr):
    """Get the edition ``<a>`` element from a table ``<tr>`` element from an author page.

    Args:
        book_tr (bs4.element.Tag): ``<tr>`` book element.

    Returns:
        bs4.element.Tag: book edition ``<a>`` element.

    Examples::

        >>> for book_tr in scrape_author_books(soup):
        ...     book_edition = get_author_book_edition(book_tr)
        ...     if book_edition:
        ...         print(book_edition.text, book_edition.get('href'))
        ...         print()
        493 editions /work/editions/1385044-the-bell-jar
        80 editions /work/editions/1185316-ariel
        30 editions /work/editions/1003095-the-collected-poems
        45 editions /work/editions/3094683-the-unabridged-journals-of-sylvia-plath
        ...

    """
    book_details = book_tr.find('span', attrs={'class': 'greyText smallText uitext'})
    return book_details.find('a', attrs={'class': 'greyText'})
def get_author_book_date(book_tr):
    """Get the published date from a table ``<tr>`` element from an author page.

    Args:
        book_tr (bs4.element.Tag): ``<tr>`` book element.

    Returns:
        int: date of publication.

    Examples::

        >>> for book_tr in scrape_author_books(soup):
        ...     book_date = get_author_book_date(book_tr)
        ...     print(book_date)
        None
        None
        1958
        2009
        ...

    """
    book_details = book_tr.find('span', attrs={'class': 'greyText smallText uitext'})
    book_publish = book_details.contents[-1].replace('—', '').replace('\n', '')
    book_date = book_publish.replace('published', '').strip()
    # Parse the year with ``int`` rather than ``eval`` to avoid executing arbitrary page text.
    book_date = int(book_date) if book_date != '' else None
    return book_date
def get_book_quote_page(soup):
    """Find the ``<a>`` element pointing to the quote page of a book.

    Args:
        soup (bs4.element.Tag): connection to a book page.

    Returns:
        bs4.element.Tag: ``<a>`` element linking to the book's quote page, or ``None`` if absent.
    """
    quote_div = soup.findAll('div', attrs={'class': ' clearFloats bigBox'})
    if quote_div:
        return quote_div[-1].find('a')
    return None
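A usage sketch, assuming ``book_url`` is a Goodreads book page opened with ``connect``; the returned ``href`` may be relative, so joining it to the site root is an assumption of this example:

    >>> from scrapereads import connect
    >>> soup = connect(book_url)  # ``book_url``: a book page (assumed)
    >>> quote_link = get_book_quote_page(soup)
    >>> if quote_link:
    ...     print(quote_link.text.strip(), quote_link.get('href'))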