From 0c1d340541426127fad9126ca9a4fb1d287b732b Mon Sep 17 00:00:00 2001
From: sneakers-the-rat
Date: Sat, 29 Oct 2022 20:08:02 -0700
Subject: [PATCH] working version!

---
 .gitignore          |   2 +
 README.md           | 177 ++++++++++++++++++++++++++++++++++++++++++++
 poetry.lock         |  35 ++++++++-
 pyproject.toml      |   6 +-
 scrape_ob/main.py   |  61 ++++++++++++++-
 scrape_ob/parse.py  | 164 +++++++++++++++++++++++++++++++++++-----
 scrape_ob/scrape.py |  70 +++++++++++++++++-
 scrape_ob/util.py   |  21 +++++-
 8 files changed, 510 insertions(+), 26 deletions(-)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1086f56
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+html
+__pycache__
diff --git a/README.md b/README.md
index 6be2902..a028c14 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,179 @@
 # scrape_ob
+Scrape Open Behavior and convert it to structured data
+
+## Components
+
+- `scrape.py` - utility functions for listing and downloading files
+- `parse.py` - parse downloaded HTML files
+- `main.py` - wrapper/entrypoint scripts
+
+## Usage
+
+```
+>>> scrape_ob --help
+usage: scrape_ob [-h] [-u URL] [-o OUTPUT] [-d]
+
+Scrape Open Behavior and return structured data
+
+options:
+  -h, --help            show this help message and exit
+  -u URL, --url URL     Root URL for open behavior's open source project directory. Default is https://edspace.american.edu/openbehavior/open-source-tools/
+  -o OUTPUT, --output OUTPUT
+                        Output directory to store downloaded html files in. Default is ./html
+  -d, --download        Just download html files without parsing them
+
+Be kind and always give credit where labor has been done
+
+```
+
+### Downloading
+
+We try to always work on local copies of the files so that we aren't asking too
+much of their server, so we start by downloading HTML copies of the project
+descriptions, say to the `./html` folder.
+
+Interactively:
+```python
+from scrape_ob.scrape import download_all
+download_all(output_folder='./html')
+```
+
+From the CLI:
+```
+scrape_ob --download
+```
+
+### Parsing
+
+From there we can parse an individual project's HTML representation into a
+structured one using two primary dataclasses:
+
+* `Project`: The primary representation of a project
+* `Blurb`: The image and link boxes at the bottom of a page that link to
+  further details about a project. Since these don't have structured
+  information, use inconsistent terminology, embed links within text, etc.,
+  we are leaving them relatively unprocessed for now, pending further
+  refinement of the parsing classes.
+
+The `Project` class can parse projects given a URL or a file, e.g.:
+
+```python
+from scrape_ob.parse import Project
+
+ap = Project.from_file('html/autopilot.html')
+```
+
+This gives us a structured representation of the project, which we can access
+directly as attributes or pull out as a dictionary like:
+
+(omitting the `body` attribute for clarity)
+```python
+>>> print(ap.dict())
+
+{
+    'name': 'AutoPilot: python framework for behavior experiments with raspberry pi',
+    'url': 'https://edspace.american.edu/openbehavior/project/autopilot/',
+    'date': datetime.datetime(2019, 12, 12, 0, 0),
+    'tags': [
+        'automated',
+        'cognition',
+        'decision-making',
+        'gui',
+        'hardware',
+        'perceptual',
+        'raspberrypi',
+        'sensory',
+        'software'
+    ],
+    'categories': [
+        'behavior-apparatus',
+        'behavior-measurement',
+        'data-analysis-software',
+        'behavior-rigs',
+        'behavior-analysis',
+        'behavioral-tasks',
+        'freely-moving',
+        'integrated-systems',
+        'stimuli'
+    ],
+    'rrids': ['SCR_021448', 'SCR_021518'],
+    'blurbs': [
+        {
+            'links': [
+                'https://www.biorxiv.org/content/10.1101/807693v1'
+            ],
+            'name': 'Paper',
+            'type': None},
+        {
+            'links': [
+                'https://github.com/wehr-lab/autopilot',
+                'http://docs.auto-pi-lot.com/'
+            ],
+            'name': 'Github',
+            'type': None
+        },
+        {
+            'links': [
+                'https://auto-pi-lot.com/',
+                'https://auto-pi-lot.com/presentation/#/'
+            ],
+            'name': 'Website',
+            'type': None
+        }
+    ],
+    'docs': None,
+    'repo': None,
+    'paper': None
+}
+```
+
+Note how we are able to pull out the "category" and "tags" information that is
+usually hidden as part of the page metadata.
+
+The extra `docs`, `repo`, and `paper` fields are currently left unfilled, as we
+will eventually use the `Blurb` class to parse out that information.
+
+The body of the project description is extracted into `body` and converted to
+markdown :)
+
+```markdown
+Jonny Saunders from Michael Wehr’s lab at the University of Oregon
+recently posted a preprint documenting their project Autopilot, which is
+a python framework for running behavioral experiments:
+
+------------------------------------------------------------------------
+
+[Autopilot](https://auto-pi-lot.com/)\xa0is a python framework for
+behavioral experiments through utilizing\xa0[Raspberry Pi
+microcontrollers](https://www.raspberrypi.org/). Autopilot incorporates
+all aspects of an experiment, including the hardware, stimuli,
+behavioral task paradigm, data management, data visualization, and a
+user interface. The authors propose that Autopilot is the fastest, least
+expensive, most flexibile behavioral system that is currently available.
+
+The benefit of using Autopilot is that it allows more experimental
+flexibility, which lets researchers to optimize it for their specific
+experimental needs. Additionally, this project exemplifies how useful a
+raspberry pi can be for performing experiments and recording data. The
+preprint discusses many benefits of raspberry pis, including their
+speed, precision and proper data logging, and they only cost $35 (!!).
+Ultimately, the authors developed Autopilot in an effort to encourage
+users to write reusable, portable experiments that is put into a public
+central library to push replication and reproducibility.
+
+*This research tool was created by your colleagues.
Please acknowledge +the Principal Investigator, cite the article in which the tool was +described, and include an\xa0RRID\xa0in the Materials and Methods of your +future publications.\xa0\xa0Project portal\xa0RRID:SCR_021448; +Software\xa0RRID:SCR_021518* +``` + +The utility function `parse_folder` can be used to parse the entire downloaded +folder of html documents! + + +## TODO + +- Complete parsing of blurbs +- Export `Project` to structured markdown with YAML headers or to whatever format + open neuro ppl want +- Make mediawiki template and client code to push into the open neuro wiki diff --git a/poetry.lock b/poetry.lock index 013485d..73ab65c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -32,6 +32,14 @@ python-versions = ">=3.6.0" [package.extras] unicode-backport = ["unicodedata2"] +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." +category = "main" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" + [[package]] name = "idna" version = "3.4" @@ -96,6 +104,23 @@ category = "main" optional = false python-versions = ">=3.6" +[[package]] +name = "tqdm" +version = "4.64.1" +description = "Fast, Extensible Progress Meter" +category = "main" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[package.extras] +dev = ["py-make (>=0.1.0)", "twine", "wheel"] +notebook = ["ipywidgets (>=6)"] +slack = ["slack-sdk"] +telegram = ["requests"] + [[package]] name = "urllib3" version = "1.26.12" @@ -112,7 +137,7 @@ socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "ca14c6295aaa4989085d6b4bd70b5e9d8299641e61f4e0b16d068d7c02a182bc" +content-hash = "bb58291e260d36ddcb0818ccf9b03cbe3e3bef25241fa3366fcfb18294720cf3" [metadata.files] beautifulsoup4 = [ @@ -127,6 +152,10 @@ charset-normalizer = [ {file = "charset-normalizer-2.1.1.tar.gz", hash = "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845"}, {file = "charset_normalizer-2.1.1-py3-none-any.whl", hash = "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"}, ] +colorama = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] idna = [ {file = "idna-3.4-py3-none-any.whl", hash = "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"}, {file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"}, @@ -218,6 +247,10 @@ soupsieve = [ {file = "soupsieve-2.3.2.post1-py3-none-any.whl", hash = "sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759"}, {file = "soupsieve-2.3.2.post1.tar.gz", hash = "sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d"}, ] +tqdm = [ + {file = "tqdm-4.64.1-py2.py3-none-any.whl", hash = "sha256:6fee160d6ffcd1b1c68c65f14c829c22832bc401726335ce92c52d395944a6a1"}, + {file = "tqdm-4.64.1.tar.gz", hash = "sha256:5f4f682a004951c1b450bc753c710e9280c5746ce6ffedee253ddbcbf54cf1e4"}, +] urllib3 = [ {file = "urllib3-1.26.12-py2.py3-none-any.whl", hash = "sha256:b930dd878d5a8afb066a637fbb35144fe7901e3b209d1cd4f524bd0e9deee997"}, {file = "urllib3-1.26.12.tar.gz", hash = 
"sha256:3fa96cf423e6987997fc326ae8df396db2a8b7c667747d47ddd8ecba91f4a74e"}, diff --git a/pyproject.toml b/pyproject.toml index 13e875c..c3e7ddf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,9 @@ [tool.poetry] name = "scrape-ob" version = "0.1.0" -description = "" +description = "Scrape Open Behavior repository and return structured data" authors = ["sneakers-the-rat "] +repository = "https://git.jon-e.net/jonny/scrape_ob" license = "AGPL-3.0" readme = "README.md" packages = [{include = "scrape_ob"}] @@ -14,7 +15,10 @@ beautifulsoup4 = "^4.11.1" lxml = "^4.9.1" parse = "^1.19.0" pypandoc = "^1.9" +tqdm = "^4.64.1" +[tool.poetry.scripts] +scrape_ob = 'scrape_ob.main:main' [build-system] requires = ["poetry-core"] diff --git a/scrape_ob/main.py b/scrape_ob/main.py index c92054f..5c36bb6 100644 --- a/scrape_ob/main.py +++ b/scrape_ob/main.py @@ -1,3 +1,60 @@ """ -User-facing runtime functions -""" \ No newline at end of file +User-facing runtime functions. + +We try and be kind by keeping and caching local copies of the HTML +instead of hitting the server every time, so we should always start with +the `download_all` function, which will skip any project files +that we already have downloaded in the output_folder +""" +from pathlib import Path +import argparse + +from scrape_ob.scrape import download_all +from scrape_ob.constants import OB_ROOT +from scrape_ob.parse import Project, parse_folder +from typing import List, Optional + + +def argparser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="scrape_ob", + description="Scrape Open Behavior and return structured data", + epilog="Be kind and always give credit where labor has been done" + ) + parser.add_argument( + '-u', '--url', default=OB_ROOT, required=False, + help=f"Root URL for open behavior's open source project directory. Default is {OB_ROOT}" + ) + parser.add_argument( + '-o', '--output', default='./html', required=False, type=Path, + help="Output directory to store downloaded html files in. 
Default is ./html" + ) + parser.add_argument( + '-d', '--download', action='store_true', + help="Just download html files without parsing them" + ) + + return parser + + +def main() -> Optional[List[Project]]: + parser = argparser() + args = parser.parse_args() + + output = Path(args.output) + + if args.download: + print('Just downloading files without parsing') + + download_all(root_url=args.url, output_folder=output) + + if args.download: + # just download, don't parse + return None + + projects = parse_folder(path=output) + return projects + + + + diff --git a/scrape_ob/parse.py b/scrape_ob/parse.py index 89e5747..5bb1ac8 100644 --- a/scrape_ob/parse.py +++ b/scrape_ob/parse.py @@ -1,27 +1,73 @@ """ Models and parsers for OB pages """ -from typing import ClassVar, List, TypedDict +import re +from typing import ClassVar, List, TypedDict, Optional, Literal, TYPE_CHECKING, Union from dataclasses import dataclass from datetime import datetime +from pathlib import Path +from pprint import pformat + from bs4 import BeautifulSoup +from bs4.element import Tag + from parse import parse as _parse +from pypandoc import convert_text +from tqdm import tqdm from scrape_ob.util import get_agent -@dataclass -class Element: - selector:ClassVar[str] - soup:BeautifulSoup - - @classmethod - def from_soup(cls, soup:BeautifulSoup) -> 'Element': - return cls(soup=soup.select(cls.selector)) +BLURB_TYPES = Literal['docs', 'paper', 'repo', 'homepage'] class Category: selector = ".et_pb_module_header a" +@dataclass +class Blurb: + SELECTOR: ClassVar[str] = '.et_pb_blurb_content' + HEADER: ClassVar[str] = 'h4' + DESCRIPTION: ClassVar[str] = '.et_pb_blurb_description' + + name: str + body: Tag + description: Tag + links: List[str] + type: Optional[BLURB_TYPES] = None + + @classmethod + def from_soup(cls, blurb) -> 'Blurb': + """ + From the bs4 Tag element that contains the blurb + """ + body = blurb + name = blurb.find(cls.HEADER).text + description = blurb.select(cls.DESCRIPTION)[0] + links = list(set([a.get('href') for a in blurb.find_all('a')])) + return Blurb( + body = body, + name = name, + description = description, + links = links + ) + + def dict(self) -> dict: + return { + k:getattr(self,k) for k,field in self.__dataclass_fields__.items() \ + if str(field._field_type) != "_FIELD_CLASSVAR" + } + + def __str__(self) -> str: + adict = self.dict() + del adict['body'] + del adict['description'] + return pformat(adict, indent=2) + + def __repr__(self) -> str: + return self.__str__() + + + @dataclass class Project: class selectors: @@ -29,20 +75,28 @@ class Project: article = "article" date = "span.published" body = ".et_pb_text_inner" - blurb = ".et_pb_blurb_content .et_pb_module_header a" + name = '.entry-title' class patterns: category = "project_category-{}" tag = "project_tag-{}" + rrid = re.compile(r"RRID:\s{0,2}([\w\d_]*)") DATE_FMT: ClassVar[str] = "%b %d, %Y" """eg. 
Oct 20, 2022""" name: str url: str + body: str date: datetime tags: List[str] categories: List[str] + rrids: List[str] + blurbs: List[Blurb] + docs: Optional[str] = None + repo: Optional[str] = None + paper: Optional[str] = None + @classmethod def parse_pattern(cls, classes:List[str], pattern:str) -> List[str]: @@ -50,16 +104,92 @@ class Project: return [t[0] for t in tags if t is not None] @classmethod - def from_url(cls, url) -> 'Project': - page = BeautifulSoup(get_agent(url).content, 'lxml') - - classes = page.select(cls.selectors.article)[0].get('class') - tags = cls.parse_pattern(classes, cls.patterns.tag) - categories = cls.parse_pattern(classes, cls.patterns.category) + def from_soup(cls, page:BeautifulSoup) -> 'Project': + name = page.select(cls.selectors.name)[0].text date = datetime.strptime( page.select(cls.selectors.date)[0].text, cls.DATE_FMT ) + url = page.find('link', rel="canonical").get('href') + # parse tag & category metadata + classes = page.select(cls.selectors.article)[0].get('class') + tags = cls.parse_pattern(classes, cls.patterns.tag) + categories = cls.parse_pattern(classes, cls.patterns.category) + + # parse body text body = page.select(cls.selectors.body)[0] - # TODO Parse HTML to markdown + body_markdown = convert_text(body, to="commonmark-raw_html", format="html") + rrids = cls.patterns.rrid.findall(body_markdown) + + # parse blurbs (links to documentation, etc.) + blurbs = [Blurb.from_soup(blurb) for blurb in page.select(Blurb.SELECTOR)] + + return Project( + name = name, + date = date, + url = url, + body = body_markdown, + tags = tags, + categories = categories, + rrids = rrids, + blurbs = blurbs + ) + + @classmethod + def from_url(cls, url:str) -> 'Project': + page = BeautifulSoup(get_agent(url).content, 'lxml') + return cls.from_soup(page) + + @classmethod + def from_file(cls, path:Union[Path, str]) -> 'Project': + with open(path, 'r') as file: + html = file.read() + page = BeautifulSoup(html, 'lxml') + return cls.from_soup(page) + + def dict(self) -> dict: + return { + k:getattr(self,k) for k,field in self.__dataclass_fields__.items() \ + if str(field._field_type) != "_FIELD_CLASSVAR" + } + + def __str__(self) -> str: + adict = self.dict() + del adict['body'] + return pformat(adict, indent=2) + + +def parse_folder(path: Path=Path('./html'), verbose=True) -> List[Project]: + """ + Parse a directory of downloaded HTML files into + :class:`.Project` objects! + + Args: + path (:class:`pathlib.Path`): Directory of downloaded html files + + Returns: + List[Project]: A list of project objects! 
+    """
+    path = Path(path)
+    html_files = list(path.glob('*.html'))
+    if verbose:
+        print('Parsing downloaded HTML')
+        pbar = tqdm(total=len(html_files))
+    else:
+        pbar = None
+
+    projects = []
+    for f in html_files:
+        try:
+            project = Project.from_file(f)
+            projects.append(project)
+        except Exception as e:
+            print(f"\nException parsing {str(f)}, got {e}")
+        finally:
+            # advance the progress bar whether or not parsing succeeded
+            if verbose:
+                pbar.update()
+
+    return projects
\ No newline at end of file
diff --git a/scrape_ob/scrape.py b/scrape_ob/scrape.py
index 11edc88..bbf701b 100644
--- a/scrape_ob/scrape.py
+++ b/scrape_ob/scrape.py
@@ -1,12 +1,16 @@
 """
 Scraping and iteration code
 """
+from pathlib import Path
+
+from tqdm import tqdm
+
 from scrape_ob import OB_ROOT
 from bs4 import BeautifulSoup
-from typing import List
+from typing import List, Union
 
-from scrape_ob.parse import Category, Project
-from scrape_ob.util import get_agent
+from scrape_ob.parse import Project
+from scrape_ob.util import get_agent, project_name
 
 
 def get_root(root_url:str=OB_ROOT) -> BeautifulSoup:
@@ -23,4 +27,62 @@ def list_projects(category_url:str) -> List[str]:
     category = BeautifulSoup(category.content, 'lxml')
     projects = category.select(Project.selectors.index)
 
-    return [p.get('href') for p in projects]
\ No newline at end of file
+    return [p.get('href') for p in projects]
+
+def list_all_projects(root_url:str=OB_ROOT, dedupe:bool=True) -> List[str]:
+    """
+    Wraps the other scraping functions to return a list of all projects
+    in all categories as a flat list
+
+    Args:
+        root_url: Root of the 'open source projects' page
+        dedupe: If ``True``, remove duplicate project URLs (projects can appear in multiple categories)
+
+    Returns:
+        list[str]: List of project page URLs
+    """
+    root = get_root(root_url)
+    categories = list_category_urls(root)
+    projects = []
+    for category in categories:
+        projects.extend(list_projects(category))
+    if dedupe:
+        projects = list(set(projects))
+    return projects
+
+
+def download_all(
+        root_url:str=OB_ROOT,
+        output_folder:Union[Path,str]=Path('./html'),
+        verbose:bool=True
+):
+    """
+    Download all OB pages, saving raw HTML to disk.
+
+    Args:
+        root_url (str): Root of the open source tools page
+        output_folder (:class:`pathlib.Path`): Directory to save HTML sources to
+        verbose (bool): Print status messages and progress bars
+    """
+    output_folder = Path(output_folder)
+    output_folder.mkdir(exist_ok=True)
+
+    # list all project urls
+    projects = list_all_projects(root_url)
+
+    # avoid downloading files that we already have
+    existing_files = [p.name for p in output_folder.glob("*.html")]
+    filtered_projects = [p for p in projects if project_name(p) + '.html' not in existing_files]
+    if verbose:
+        print(f'\nTotal projects: {len(projects)}\nExisting files: {len(existing_files)}\nDownloading files: {len(filtered_projects)}')
+        pbar = tqdm(total=len(filtered_projects))
+    else:
+        pbar = None
+
+    # Download everything to .html files!
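+    # Each page is written unmodified as raw bytes to <output_folder>/<project_name>.html,
+    # so a re-run of download_all will skip it via the existing-file filter above.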
+    for project in filtered_projects:
+        page = get_agent(project)
+        output_file = output_folder / (project_name(project) + '.html')
+        with open(output_file, 'wb') as output:
+            output.write(page.content)
+        if verbose:
+            pbar.update()
\ No newline at end of file
diff --git a/scrape_ob/util.py b/scrape_ob/util.py
index 7c53197..eb51091 100644
--- a/scrape_ob/util.py
+++ b/scrape_ob/util.py
@@ -1,4 +1,5 @@
 from random import choice
+from urllib.parse import urlsplit
 
 import requests
 
@@ -17,4 +18,22 @@ def get_agent(url, **kwargs) -> requests.Response:
         url,
         **kwargs
     )
-    return gotten
\ No newline at end of file
+    return gotten
+
+
+def project_name(url:str) -> str:
+    """
+    Get the project name from the URL of an OB page
+
+    Examples:
+        'https://edspace.american.edu/openbehavior/project/ethoscopes/'
+        becomes
+        'ethoscopes'
+
+    Args:
+        url: URL of project page
+
+    Returns:
+        str
+    """
+    return urlsplit(url).path.strip('/').split('/')[-1]