# scrape_ob/scrape_ob/scrape.py
"""
Scraping and iteration code
"""
from pathlib import Path
from tqdm import tqdm
from scrape_ob import OB_ROOT
from bs4 import BeautifulSoup
from typing import List, Union
from scrape_ob.parse import Project
from scrape_ob.util import get_agent, project_name
def get_root(root_url: str = OB_ROOT) -> BeautifulSoup:
    """
    Fetch the root 'open source projects' page and parse it.

    Args:
        root_url (str): URL of the page to fetch (defaults to ``OB_ROOT``)

    Returns:
        :class:`bs4.BeautifulSoup`: Parsed tree of the root page
    """
    response = get_agent(root_url)
    return BeautifulSoup(response.content, 'lxml')
def list_category_urls(root: BeautifulSoup) -> List[str]:
    """
    Extract the category page URLs from the parsed root page.

    Args:
        root: Parsed root page, as returned by :func:`get_root`

    Returns:
        list[str]: ``href`` values of the matched category links
    """
    # Bug fix: ``Category`` was never imported at module level (only
    # ``Project`` is), so this function raised NameError when called.
    # Import it locally to bring the selector class into scope.
    from scrape_ob.parse import Category
    categories = root.select(Category.selector)
    return [c.get('href') for c in categories]
def list_projects(category_url: str) -> List[str]:
    """
    Fetch one category page and list the project URLs it links to.

    Args:
        category_url (str): URL of a category page

    Returns:
        list[str]: ``href`` values of the project links on the page
    """
    response = get_agent(category_url)
    soup = BeautifulSoup(response.content, 'lxml')
    links = soup.select(Project.selectors.index)
    return [link.get('href') for link in links]
def list_all_projects(root_url: str = OB_ROOT, dedupe: bool = True) -> List[str]:
    """
    Wraps the other scraping functions to return a list of all projects
    in all categories as a flat list

    Args:
        root_url: Root of the 'open source projects' page
        dedupe: Drop duplicate project URLs that appear under more than
            one category (default ``True``)

    Returns:
        list[str]: List of project page URLs
    """
    root = get_root(root_url)
    categories = list_category_urls(root)
    projects = []
    for category in categories:
        projects.extend(list_projects(category))
    if dedupe:
        # dict.fromkeys dedupes while preserving first-seen order;
        # list(set(...)) returned the URLs in arbitrary order.
        projects = list(dict.fromkeys(projects))
    return projects
def download_all(
        root_url: str = OB_ROOT,
        output_folder: Union[Path, str] = Path('./html'),
        verbose: bool = True
        ):
    """
    Download all OB pages, saving raw HTML to disk.

    Projects whose ``<name>.html`` file already exists in ``output_folder``
    are skipped, so repeated runs only fetch new pages.

    Args:
        root_url (str): Root of the open source tools page
        output_folder (:class:`pathlib.Path`): Directory to save HTML sources to
        verbose (bool): Print status messages and progress bars
    """
    output_folder = Path(output_folder)
    # parents=True so a nested output path doesn't raise FileNotFoundError
    output_folder.mkdir(parents=True, exist_ok=True)

    # list all project urls
    projects = list_all_projects(root_url)

    # avoid downloading files that we already have; a set gives O(1)
    # membership tests instead of scanning a list for every project
    existing_files = {p.name for p in output_folder.glob("*.html")}
    filtered_projects = [
        p for p in projects
        if project_name(p) + '.html' not in existing_files
    ]

    if verbose:
        print(f'\nTotal projects: {len(projects)}\nExisting files: {len(existing_files)}\nDownloading files: {len(filtered_projects)}')
        pbar = tqdm(total=len(filtered_projects))
    else:
        pbar = None

    try:
        # Download everything to .html files!
        for project in filtered_projects:
            page = get_agent(project)
            output_file = output_folder / (project_name(project) + '.html')
            with open(output_file, 'wb') as output:
                output.write(page.content)
            if pbar is not None:
                pbar.update()
    finally:
        if pbar is not None:
            # close the bar so terminal state is restored even on error;
            # the original leaked the tqdm instance
            pbar.close()