# Source code for scrapereads.connect

"""
A scraper is used to connect to a website and extract data.
"""

# import libraries
import warnings
import bs4
import urllib.request
import time

# Global variables
SLEEP = 0
VERBOSE = True
USER = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'


def set_sleep(value):
    """Set the delay applied before every connection.

    ``connect`` passes the module-level ``SLEEP`` value to ``time.sleep``
    before each request. ``time.sleep`` rejects negative delays, so an
    invalid value is refused here, where the mistake is made, instead of
    surfacing later in the middle of a scraping run.

    Args:
        value (float): number of seconds to sleep before each connection.
            Must be non-negative.

    Raises:
        ValueError: if ``value`` is negative or NaN. ``SLEEP`` keeps its
            previous value in that case.

    """
    global SLEEP
    # Written as ``not value >= 0`` (rather than ``value < 0``) so that NaN,
    # which compares False against everything, is rejected as well.
    if not value >= 0:
        raise ValueError(f'sleep must be a non-negative number of seconds, got {value!r}')
    SLEEP = value


def set_verbose(value):
    """Set the module-level ``VERBOSE`` flag.

    ``connect`` prints a confirmation message after a successful request
    when this flag is truthy.

    Args:
        value (bool): new value for ``VERBOSE``.

    """
    global VERBOSE
    VERBOSE = value


def set_user(user):
    """Replace the User-Agent string sent by ``connect``.

    Args:
        user (string): new value for the module-level ``USER`` setting.
            A falsy value (``None``, ``''``) is ignored and the current
            setting is kept.

    """
    global USER
    # Nothing to do when no replacement was supplied.
    if not user:
        return
    USER = user


def connect(url):
    """Connect to an URL and parse the page.

    The request is configured through module-level settings rather than
    through arguments: ``SLEEP`` (seconds to wait before the request, see
    ``set_sleep``), ``USER`` (User-Agent header, see ``set_user``) and
    ``VERBOSE`` (print a message on success, see ``set_verbose``).

    Args:
        url (string): url path

    Returns:
        soup: the page parsed by ``bs4.BeautifulSoup`` with the ``lxml``
        parser, or ``None`` when the request fails with an HTTP error.

    Warns:
        RuntimeWarning: when the request fails with an HTTP error.

    """
    # Slow down the script to bypass bot detections
    time.sleep(SLEEP)

    # Prevent ERROR: 403 - Forbidden
    request = urllib.request.Request(url, headers={'User-Agent': USER})

    try:
        # The response is parsed inside the ``with`` block so the underlying
        # connection is closed as soon as the document has been read.
        with urllib.request.urlopen(request) as page:
            soup = bs4.BeautifulSoup(page, 'lxml')
    except urllib.error.HTTPError as e:
        # Only HTTP status errors are downgraded to a warning; any other
        # failure still propagates to the caller.
        warn_msg = f'\n{e}. Failed to connect to {url}.\n' \
                   f'Please verify the spelling or make sure that this page exists. `None` was returned.'
        warnings.warn(warn_msg, RuntimeWarning)
        return None

    if VERBOSE:
        print(f"Successfully connected to {url}")
    return soup