"""
|
|
Scraping and iteration code
|
|
"""
|
|
from pathlib import Path
from typing import List, Union

from bs4 import BeautifulSoup
from tqdm import tqdm

from scrape_ob import OB_ROOT
from scrape_ob.parse import Category, Project  # Category assumed to provide the category-link CSS selector
from scrape_ob.util import get_agent, project_name


def get_root(root_url:str=OB_ROOT) -> BeautifulSoup:
    """
    Fetch the root 'open source projects' page and parse it with BeautifulSoup.
    """
    root = get_agent(root_url)
    root = BeautifulSoup(root.content, 'lxml')
    return root


def list_category_urls(root:BeautifulSoup) -> List[str]:
    """
    Extract the URLs of every category page linked from the root page.
    """
    categories = root.select(Category.selector)
    return [c.get('href') for c in categories]


def list_projects(category_url:str) -> List[str]:
    """
    Extract the project page URLs listed on a single category page.
    """
    category = get_agent(category_url)
    category = BeautifulSoup(category.content, 'lxml')

    projects = category.select(Project.selectors.index)
    return [p.get('href') for p in projects]


def list_all_projects(root_url:str=OB_ROOT, dedupe:bool=True) -> List[str]:
    """
    Wraps the other scraping functions to return a list of all projects
    in all categories as a flat list.

    Args:
        root_url: Root of the 'open source projects' page
        dedupe: If True, remove duplicate URLs for projects that appear
            in more than one category

    Returns:
        list[str]: List of project page URLs
    """
    root = get_root(root_url)
    categories = list_category_urls(root)
    projects = []
    for category in categories:
        projects.extend(list_projects(category))
    if dedupe:
        projects = list(set(projects))
    return projects


def download_all(
        root_url:str=OB_ROOT,
        output_folder:Union[Path,str]=Path('./html'),
        verbose:bool=True
    ):
    """
    Download all OB pages, saving raw HTML to disk.

    Args:
        root_url (str): Root of the open source tools page
        output_folder (:class:`pathlib.Path`): Directory to save HTML sources to
        verbose (bool): Print status messages and progress bars
    """
    output_folder = Path(output_folder)
    output_folder.mkdir(exist_ok=True)

    # list all project urls
    projects = list_all_projects(root_url)

    # avoid downloading files that we already have
    existing_files = [p.name for p in output_folder.glob("*.html")]
    filtered_projects = [p for p in projects if project_name(p) + '.html' not in existing_files]
    if verbose:
        print(f'\nTotal projects: {len(projects)}\nExisting files: {len(existing_files)}\nDownloading files: {len(filtered_projects)}')
        pbar = tqdm(total=len(filtered_projects))
    else:
        pbar = None

    # Download everything to .html files!
    for project in filtered_projects:
        page = get_agent(project)
        output_file = output_folder / (project_name(project) + '.html')
        with open(output_file, 'wb') as output:
            output.write(page.content)
        if verbose:
            pbar.update()
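

# Example usage (a minimal sketch): assuming the scrape_ob package is installed
# and OB_ROOT points at the live 'open source projects' index, this mirrors
# every project page into ./html and reports how many URLs were found.
if __name__ == "__main__":
    project_urls = list_all_projects()
    print(f"Found {len(project_urls)} project pages")
    download_all(output_folder=Path('./html'), verbose=True)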