# scrape_ob/scrape_ob/main.py

"""
User-facing runtime functions.
We try to be kind by keeping and caching local copies of the HTML
instead of hitting the server every time, so we should always start with
the `download_all` function, which will skip any project files
that we already have downloaded in the output_folder
"""
from pathlib import Path
import argparse
from scrape_ob.scrape import download_all
from scrape_ob.constants import OB_ROOT
from scrape_ob.parse import Project, parse_folder
from typing import List, Optional
def argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for the scrape_ob entry point.

    Returns:
        An :class:`argparse.ArgumentParser` understanding ``-u/--url``,
        ``-o/--output`` (coerced to :class:`pathlib.Path`), and the
        ``-d/--download`` flag.
    """
    cli = argparse.ArgumentParser(
        prog="scrape_ob",
        description="Scrape Open Behavior and return structured data",
        epilog="Be kind and always give credit where labor has been done",
    )
    cli.add_argument(
        '-u', '--url',
        default=OB_ROOT,
        required=False,
        help=f"Root URL for open behavior's open source project directory. Default is {OB_ROOT}",
    )
    cli.add_argument(
        '-o', '--output',
        default='./html',
        required=False,
        type=Path,
        help="Output directory to store downloaded html files in. Default is ./html",
    )
    cli.add_argument(
        '-d', '--download',
        action='store_true',
        help="Just download html files without parsing them",
    )
    return cli
def main() -> Optional[List[Project]]:
    """Download Open Behavior project pages and, unless ``--download``
    was given, parse them into structured records.

    Previously-downloaded files in the output folder are skipped by
    ``download_all``, so re-running is cheap on the server.

    Returns:
        The parsed :class:`Project` list, or ``None`` when the
        ``--download`` flag requested download-only mode.
    """
    args = argparser().parse_args()
    # argparse already coerces --output via type=Path, but a provided
    # default stays whatever object it was — normalize defensively.
    output = Path(args.output)
    # Single guard instead of testing args.download twice: print the
    # notice, fetch, and bail out before any parsing work.
    if args.download:
        print('Just downloading files without parsing')
        download_all(root_url=args.url, output_folder=output)
        return None
    download_all(root_url=args.url, output_folder=output)
    return parse_folder(path=output)