Source code for scrapereads.connect

"""
A scraper is used to connect to a website and extract data.
"""

# import libraries
import warnings
import bs4
import urllib.request
import urllib.error
import time

# Global variables
SLEEP = 0
VERBOSE = True
USER = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'


def set_sleep(value):
    """Set the number of seconds to wait before each connection."""
    global SLEEP
    SLEEP = value


def set_verbose(value):
    """Enable or disable printing of the URL on a successful connection."""
    global VERBOSE
    VERBOSE = value


def set_user(user):
    """Set the User-Agent header used for requests, if ``user`` is non-empty."""
    global USER
    if user:
        USER = user


def connect(url):
    """Connect to a URL.

    The module-level settings ``SLEEP``, ``VERBOSE`` and ``USER`` control the
    delay before the request, the logging and the User-Agent header.

    Args:
        url (string): URL of the page to connect to.

    Returns:
        bs4.BeautifulSoup: the parsed page, or ``None`` if the connection failed.
    """
    # Slow down the script to bypass bot detections
    time.sleep(SLEEP)
    # Prevent ERROR: 403 - Forbidden
    # user_agent = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
    # user_agent = 'Mozilla/5.0'
    headers = {'User-Agent': USER}
    try:
        req = urllib.request.Request(url, headers=headers)
        page = urllib.request.urlopen(req)
        soup = bs4.BeautifulSoup(page, 'lxml')
        if VERBOSE:
            print(f"Successfully connected to {url}")
    except urllib.error.HTTPError as e:
        warn_msg = (f'\n{e}. Failed to connect to {url}.\n'
                    f'Please verify the spelling or make sure that this page exists. `None` was returned.')
        warnings.warn(warn_msg, RuntimeWarning)
        soup = None
    return soup
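
A minimal usage sketch (not part of the module source), assuming the package is importable as ``scrapereads`` with this module as ``scrapereads.connect`` and that ``lxml`` is installed; the example URL and setting values are purely illustrative:

    import scrapereads.connect as scraper  # hypothetical import path

    # Configure the module-level settings before connecting.
    scraper.set_sleep(1)       # wait 1 second before each request
    scraper.set_verbose(True)  # print the URL on success

    soup = scraper.connect('https://example.com/')  # illustrative URL
    if soup is not None:
        print(soup.title.string)  # inspect the parsed page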