# scrape_ob/scrape_ob/util.py
# (file-viewer header: 39 lines, 787 B, Python)

from random import choice
from urllib.parse import urlsplit
import requests
from scrape_ob.agents import USER_AGENTS
def get_agent(url, **kwargs) -> requests.Response:
    """
    GET a URL while presenting a browser-like ``User-Agent`` header.

    A user agent is chosen at random from ``USER_AGENTS`` because the
    target site apparently rejects requests that arrive without one.

    Args:
        url: URL to request
        **kwargs: forwarded unchanged to ``requests.get``. A ``headers``
            mapping supplied here is preserved and merged with the
            generated ``User-Agent``; if it already contains a
            ``User-Agent`` entry (any letter case), that value is kept.

    Returns:
        requests.Response
    """
    # Copy the caller's headers so their dict is never mutated, and merge
    # instead of replacing so other headers (cookies, Accept, ...) survive.
    headers = dict(kwargs.pop("headers", None) or {})

    # HTTP header names are case-insensitive, so compare case-insensitively
    # before adding a spoofed user agent.
    if not any(str(name).lower() == "user-agent" for name in headers):
        headers["User-Agent"] = choice(USER_AGENTS)

    return requests.get(url, headers=headers, **kwargs)
def project_name(url: str) -> str:
    """
    Extract the project name (the last path segment) from an OB page URL.

    Examples:
        'https://edspace.american.edu/openbehavior/project/ethoscopes/'
        yields
        'ethoscopes'

    Args:
        url: URL of project page

    Returns:
        str: the final non-empty path component, or an empty string when
        the URL has no path
    """
    # Only the path matters; query string and fragment are split off here.
    page_path = urlsplit(url).path
    # Drop leading/trailing slashes so a trailing '/' does not yield ''.
    trimmed = page_path.strip('/')
    # Everything after the last remaining '/' (or the whole string if none).
    _, _, last_segment = trimmed.rpartition('/')
    return last_segment