# scrape_ob/scrape_ob/util.py

from random import choice
from urllib.parse import urlsplit

import requests

from scrape_ob.agents import USER_AGENTS


def get_agent(url, **kwargs) -> requests.Response:
    """
    GET a URL, sending a randomly chosen ``User-Agent`` header.

    Any ``headers`` passed by the caller are preserved and merged with the
    random ``User-Agent`` rather than being discarded. A ``User-Agent`` key
    supplied by the caller replaces the random one.

    Args:
        url: URL to request
        **kwargs: forwarded unchanged to :func:`requests.get`
            (apart from the ``headers`` merge described above)

    Returns:
        requests.Response
    """
    # fake a user agent because apparently they reject blank connections lol
    # `headers=None` is a valid argument to requests, so treat it like "absent"
    caller_headers = kwargs.pop("headers", None) or {}

    # random agent goes first so explicitly passed headers survive the merge
    kwargs["headers"] = {"User-Agent": choice(USER_AGENTS), **caller_headers}

    return requests.get(url, **kwargs)


def project_name(url: str) -> str:
    """
    Extract the final path segment of an OB project URL.

    Leading and trailing slashes of the URL path are ignored; the query
    string and fragment are not part of the path and so never appear in
    the result. A URL with an empty path yields an empty string.

    Examples:
        'https://edspace.american.edu/openbehavior/project/ethoscopes/'
        becomes
        'ethoscopes'

    Args:
        url: URL of project page

    Returns:
        str
    """
    # drop surrounding slashes so a trailing '/' does not give an empty name
    trimmed_path = urlsplit(url).path.strip('/')
    # everything after the last remaining '/' (or the whole string if none)
    _, _, last_segment = trimmed_path.rpartition('/')
    return last_segment