61 lines
1.7 KiB
Python
61 lines
1.7 KiB
Python
"""
|
|
User-facing runtime functions.
|
|
|
|
We try and be kind by keeping and caching local copies of the HTML
|
|
instead of hitting the server every time, so we should always start with
|
|
the `download_all` function, which will skip any project files
|
|
that we already have downloaded in the output_folder
|
|
"""
|
|
from pathlib import Path
|
|
import argparse
|
|
|
|
from scrape_ob.scrape import download_all
|
|
from scrape_ob.constants import OB_ROOT
|
|
from scrape_ob.parse import Project, parse_folder
|
|
from typing import List, Optional
|
|
|
|
|
|
def argparser() -> argparse.ArgumentParser:
    """
    Build the command-line argument parser for ``scrape_ob``.

    Options:

    * ``-u`` / ``--url``: root URL to scrape; defaults to ``OB_ROOT``.
    * ``-o`` / ``--output``: directory to store downloaded html files in,
      converted to a :class:`pathlib.Path`; defaults to ``./html``.
    * ``-d`` / ``--download``: flag requesting download without parsing.

    Returns:
        The configured :class:`argparse.ArgumentParser`.
    """
    parser = argparse.ArgumentParser(
        prog="scrape_ob",
        description="Scrape Open Behavior and return structured data",
        epilog="Be kind and always give credit where labor has been done",
    )
    # argparse %-formats help strings, so let it substitute the default via
    # ``%(default)s`` instead of baking the value in with an f-string: a
    # literal ``%`` in the value would otherwise make ``--help`` raise.
    parser.add_argument(
        '-u', '--url', default=OB_ROOT, required=False,
        help="Root URL for open behavior's open source project directory. "
             "Default is %(default)s",
    )
    parser.add_argument(
        '-o', '--output', default='./html', required=False, type=Path,
        help="Output directory to store downloaded html files in. "
             "Default is %(default)s",
    )
    parser.add_argument(
        '-d', '--download', action='store_true',
        help="Just download html files without parsing them",
    )

    return parser
|
|
|
|
|
|
def main() -> Optional[List[Project]]:
    """
    Command-line entry point: download the project pages, then parse them.

    Arguments are read from the command line using the parser built by
    ``argparser``. ``download_all`` is always called with the requested root
    URL and output folder.

    Returns:
        ``None`` when ``--download`` was given, otherwise whatever
        ``parse_folder`` returns for the output folder.
    """
    args = argparser().parse_args()
    html_dir = Path(args.output)
    download_only = args.download

    if download_only:
        print('Just downloading files without parsing')

    download_all(root_url=args.url, output_folder=html_dir)

    # In download-only mode there is nothing to parse, so hand back None.
    return None if download_only else parse_folder(path=html_dir)
|
|
|
|
|
|
|
|
|