# scrape_ob/scrape_ob/scrape.py
"""
Scraping and iteration code
"""
from pathlib import Path
from tqdm import tqdm
from scrape_ob import OB_ROOT
from bs4 import BeautifulSoup
from typing import List, Union
from scrape_ob.parse import Project
from scrape_ob.util import get_agent, project_name
def get_root(root_url: str = OB_ROOT) -> BeautifulSoup:
    """
    Fetch the root 'open source projects' page and parse it.

    Args:
        root_url (str): URL of the page to fetch (defaults to ``OB_ROOT``)

    Returns:
        :class:`bs4.BeautifulSoup`: Parsed tree of the root page
    """
    response = get_agent(root_url)
    return BeautifulSoup(response.content, 'lxml')
def list_category_urls(root: BeautifulSoup) -> List[str]:
    """
    Extract the category page URLs from the parsed root page.

    Args:
        root: Parsed root page, as returned by :func:`get_root`

    Returns:
        list[str]: ``href`` values of the matched category links
    """
    # Bug fix: ``Category`` was never imported at module level (only
    # ``Project`` is), so this function raised NameError when called.
    # Import it locally to bring the selector class into scope.
    from scrape_ob.parse import Category
    categories = root.select(Category.selector)
    return [c.get('href') for c in categories]
def list_projects(category_url: str) -> List[str]:
    """
    Fetch one category page and list the project URLs it links to.

    Args:
        category_url (str): URL of a category page

    Returns:
        list[str]: ``href`` values of the project links on the page
    """
    response = get_agent(category_url)
    soup = BeautifulSoup(response.content, 'lxml')
    links = soup.select(Project.selectors.index)
    return [link.get('href') for link in links]
def list_all_projects(root_url: str = OB_ROOT, dedupe: bool = True) -> List[str]:
    """
    Wraps the other scraping functions to return a list of all projects
    in all categories as a flat list

    Args:
        root_url: Root of the 'open source projects' page
        dedupe: Drop duplicate project URLs that appear under more than
            one category (default ``True``)

    Returns:
        list[str]: List of project page URLs
    """
    root = get_root(root_url)
    categories = list_category_urls(root)
    projects = []
    for category in categories:
        projects.extend(list_projects(category))
    if dedupe:
        # dict.fromkeys dedupes while preserving first-seen order;
        # list(set(...)) returned the URLs in arbitrary order.
        projects = list(dict.fromkeys(projects))
    return projects
def download_all(
        root_url: str = OB_ROOT,
        output_folder: Union[Path, str] = Path('./html'),
        verbose: bool = True
        ):
    """
    Download all OB pages, saving raw HTML to disk.

    Projects whose ``<name>.html`` file already exists in ``output_folder``
    are skipped, so repeated runs only fetch new pages.

    Args:
        root_url (str): Root of the open source tools page
        output_folder (:class:`pathlib.Path`): Directory to save HTML sources to
        verbose (bool): Print status messages and progress bars
    """
    output_folder = Path(output_folder)
    # parents=True so a nested output path doesn't raise FileNotFoundError
    output_folder.mkdir(parents=True, exist_ok=True)

    # list all project urls
    projects = list_all_projects(root_url)

    # avoid downloading files that we already have; a set gives O(1)
    # membership tests instead of scanning a list for every project
    existing_files = {p.name for p in output_folder.glob("*.html")}
    filtered_projects = [
        p for p in projects
        if project_name(p) + '.html' not in existing_files
    ]

    if verbose:
        print(f'\nTotal projects: {len(projects)}\nExisting files: {len(existing_files)}\nDownloading files: {len(filtered_projects)}')
        pbar = tqdm(total=len(filtered_projects))
    else:
        pbar = None

    try:
        # Download everything to .html files!
        for project in filtered_projects:
            page = get_agent(project)
            output_file = output_folder / (project_name(project) + '.html')
            with open(output_file, 'wb') as output:
                output.write(page.content)
            if pbar is not None:
                pbar.update()
    finally:
        if pbar is not None:
            # close the bar so terminal state is restored even on error;
            # the original leaked the tqdm instance
            pbar.close()