working version!

sneakers-the-rat 2022-10-29 20:08:02 -07:00
parent bb6775d14e
commit 0c1d340541
8 changed files with 510 additions and 26 deletions

.gitignore (vendored, new file, 2 lines changed)

@ -0,0 +1,2 @@
html
__pycache__

README.md (177 lines changed)

@ -1,2 +1,179 @@
# scrape_ob
Scrape Open Behavior and convert it to structured data
## Components
- `scrape.py` - utility functions for listing and downloading files
- `parse.py` - Parse downloaded HTML files
- `main.py` - Wrapper/entrypoint scripts
## Usage
```
>>> scrape_ob --help
usage: scrape_ob [-h] [-u URL] [-o OUTPUT] [-d]
Scrape Open Behavior and return structured data
options:
-h, --help show this help message and exit
-u URL, --url URL Root URL for open behavior's open source project directory. Default is https://edspace.american.edu/openbehavior/open-source-tools/
-o OUTPUT, --output OUTPUT
Output directory to store downloaded html files in. Default is ./html
-d, --download Just download html files without parsing them
Be kind and always give credit where labor has been done
```
### Downloading
We try to always work on local copies of the files so that we aren't making too many
requests to their server, so the first step is to download HTML copies of the project descriptions, say to the `./html` folder.
Interactively:
```python
from scrape_ob.scrape import download_all
download_all(output_folder='./html')
```
From the CLI:
```
scrape_ob --download
```
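Leaving off `--download` runs the full pipeline, downloading anything that isn't already cached and then parsing it. For example, with the output folder given explicitly (the same as the default):
```
scrape_ob -o ./html
```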
### Parsing
From there we can parse an individual project's HTML representation into a
structured one using two primary dataclasses:
* `Project`: The primary representation of a project
* `Blurb`: The image and link boxes at the bottom of a page that link to
further details about a project. Since these don't have structured information,
use inconsistent terminology, embed links within text, etc., we leave them
relatively unprocessed for now, pending further refinement of the parsing classes.
The `Project` class can parse a project given a URL or a file, e.g.
```python
from scrape_ob.parse import Project
ap = Project.from_file('html/autopilot.html')
```
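Parsing straight from the live page should work the same way via `Project.from_url` (a quick sketch using the AutoPilot project URL shown in the output below):
```python
from scrape_ob.parse import Project

ap = Project.from_url('https://edspace.american.edu/openbehavior/project/autopilot/')
```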
Either way, we get a structured representation of the project, which
we can access directly as attributes or pull out as a dictionary
(omitting the `body` attribute for clarity):
```python
>>> print(ap.dict())
{
'name': 'AutoPilot: python framework for behavior experiments with raspberry pi',
'url': 'https://edspace.american.edu/openbehavior/project/autopilot/',
'date': datetime.datetime(2019, 12, 12, 0, 0),
'tags': [
'automated',
'cognition',
'decision-making',
'gui',
'hardware',
'perceptual',
'raspberrypi',
'sensory',
'software'
],
'categories': [
'behavior-apparatus',
'behavior-measurement',
'data-analysis-software',
'behavior-rigs',
'behavior-analysis',
'behavioral-tasks',
'freely-moving',
'integrated-systems',
'stimuli'
],
'rrids': ['SCR_021448', 'SCR_021518'],
'blurbs': [
{
'links': [
'https://www.biorxiv.org/content/10.1101/807693v1'
],
'name': 'Paper',
'type': None},
{
'links': [
'https://github.com/wehr-lab/autopilot',
'http://docs.auto-pi-lot.com/'
],
'name': 'Github',
'type': None
},
{
'links': [
'https://auto-pi-lot.com/',
'https://auto-pi-lot.com/presentation/#/'
],
'name': 'Website',
'type': None
}
],
'docs': None,
'repo': None,
'paper': None
}
```
Note how we are able to pull out the "category" and "tags" information, which
is usually hidden in the page metadata.
The extra `docs`, `repo`, and `paper` fields are currently left unfilled; we
will eventually use the `Blurb` class to parse out that information.
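In the meantime, every parsed field is a plain attribute, so we can pick values straight off the object (using the AutoPilot example above):
```python
>>> ap.rrids
['SCR_021448', 'SCR_021518']
>>> ap.tags[:3]
['automated', 'cognition', 'decision-making']
>>> ap.blurbs[1].name
'Github'
```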
The body of the project description is extracted into `body` and converted to
markdown :)
```markdown
Jonny Saunders from Michael Wehrs lab at the University of Oregon
recently posted a preprint documenting their project Autopilot, which is
a python framework for running behavioral experiments:
------------------------------------------------------------------------
[Autopilot](https://auto-pi-lot.com/)\xa0is a python framework for
behavioral experiments through utilizing\xa0[Raspberry Pi
microcontrollers](https://www.raspberrypi.org/). Autopilot incorporates
all aspects of an experiment, including the hardware, stimuli,
behavioral task paradigm, data management, data visualization, and a
user interface. The authors propose that Autopilot is the fastest, least
expensive, most flexibile behavioral system that is currently available.
The benefit of using Autopilot is that it allows more experimental
flexibility, which lets researchers to optimize it for their specific
experimental needs. Additionally, this project exemplifies how useful a
raspberry pi can be for performing experiments and recording data. The
preprint discusses many benefits of raspberry pis, including their
speed, precision and proper data logging, and they only cost $35 (!!).
Ultimately, the authors developed Autopilot in an effort to encourage
users to write reusable, portable experiments that is put into a public
central library to push replication and reproducibility.
*This research tool was created by your colleagues. Please acknowledge
the Principal Investigator, cite the article in which the tool was
described, and include an\xa0RRID\xa0in the Materials and Methods of your
future publications.\xa0\xa0Project portal\xa0RRID:SCR_021448;
Software\xa0RRID:SCR_021518*
```
The utility function `parse_folder` can be used to parse the entire downloaded
folder of HTML documents!
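A minimal sketch, assuming the default `./html` folder from the download step:
```python
from scrape_ob.parse import parse_folder

projects = parse_folder('./html')
print(f"parsed {len(projects)} projects")
```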
## TODO
- Complete parsing of blurbs
- Export `Project` to structured markdown with YAML headers or to whatever format
open neuro ppl want
- Make mediawiki template and client code to push into the open neuro wiki

poetry.lock (generated, 35 lines changed)

@ -32,6 +32,14 @@ python-versions = ">=3.6.0"
[package.extras]
unicode-backport = ["unicodedata2"]
[[package]]
name = "colorama"
version = "0.4.6"
description = "Cross-platform colored terminal text."
category = "main"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
[[package]]
name = "idna"
version = "3.4"
@ -96,6 +104,23 @@ category = "main"
optional = false
python-versions = ">=3.6"
[[package]]
name = "tqdm"
version = "4.64.1"
description = "Fast, Extensible Progress Meter"
category = "main"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7"
[package.dependencies]
colorama = {version = "*", markers = "platform_system == \"Windows\""}
[package.extras]
dev = ["py-make (>=0.1.0)", "twine", "wheel"]
notebook = ["ipywidgets (>=6)"]
slack = ["slack-sdk"]
telegram = ["requests"]
[[package]]
name = "urllib3"
version = "1.26.12"
@ -112,7 +137,7 @@ socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"]
[metadata]
lock-version = "1.1"
python-versions = "^3.9"
content-hash = "ca14c6295aaa4989085d6b4bd70b5e9d8299641e61f4e0b16d068d7c02a182bc"
content-hash = "bb58291e260d36ddcb0818ccf9b03cbe3e3bef25241fa3366fcfb18294720cf3"
[metadata.files]
beautifulsoup4 = [
@ -127,6 +152,10 @@ charset-normalizer = [
{file = "charset-normalizer-2.1.1.tar.gz", hash = "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845"},
{file = "charset_normalizer-2.1.1-py3-none-any.whl", hash = "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"},
]
colorama = [
{file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
]
idna = [
{file = "idna-3.4-py3-none-any.whl", hash = "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"},
{file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"},
@ -218,6 +247,10 @@ soupsieve = [
{file = "soupsieve-2.3.2.post1-py3-none-any.whl", hash = "sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759"},
{file = "soupsieve-2.3.2.post1.tar.gz", hash = "sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d"},
]
tqdm = [
{file = "tqdm-4.64.1-py2.py3-none-any.whl", hash = "sha256:6fee160d6ffcd1b1c68c65f14c829c22832bc401726335ce92c52d395944a6a1"},
{file = "tqdm-4.64.1.tar.gz", hash = "sha256:5f4f682a004951c1b450bc753c710e9280c5746ce6ffedee253ddbcbf54cf1e4"},
]
urllib3 = [
{file = "urllib3-1.26.12-py2.py3-none-any.whl", hash = "sha256:b930dd878d5a8afb066a637fbb35144fe7901e3b209d1cd4f524bd0e9deee997"},
{file = "urllib3-1.26.12.tar.gz", hash = "sha256:3fa96cf423e6987997fc326ae8df396db2a8b7c667747d47ddd8ecba91f4a74e"},

pyproject.toml

@ -1,8 +1,9 @@
[tool.poetry]
name = "scrape-ob"
version = "0.1.0"
description = ""
description = "Scrape Open Behavior repository and return structured data"
authors = ["sneakers-the-rat <JLSaunders987@gmail.com>"]
repository = "https://git.jon-e.net/jonny/scrape_ob"
license = "AGPL-3.0"
readme = "README.md"
packages = [{include = "scrape_ob"}]
@ -14,7 +15,10 @@ beautifulsoup4 = "^4.11.1"
lxml = "^4.9.1"
parse = "^1.19.0"
pypandoc = "^1.9"
tqdm = "^4.64.1"
[tool.poetry.scripts]
scrape_ob = 'scrape_ob.main:main'
[build-system]
requires = ["poetry-core"]

scrape_ob/main.py

@ -1,3 +1,60 @@
"""
User-facing runtime functions
"""
User-facing runtime functions.
We try and be kind by keeping and caching local copies of the HTML
instead of hitting the server every time, so we should always start with
the `download_all` function, which will skip any project files
that we already have downloaded in the output_folder
"""
from pathlib import Path
import argparse

from scrape_ob.scrape import download_all
from scrape_ob.constants import OB_ROOT
from scrape_ob.parse import Project, parse_folder
from typing import List, Optional


def argparser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        prog="scrape_ob",
        description="Scrape Open Behavior and return structured data",
        epilog="Be kind and always give credit where labor has been done"
    )
    parser.add_argument(
        '-u', '--url', default=OB_ROOT, required=False,
        help=f"Root URL for open behavior's open source project directory. Default is {OB_ROOT}"
    )
    parser.add_argument(
        '-o', '--output', default='./html', required=False, type=Path,
        help="Output directory to store downloaded html files in. Default is ./html"
    )
    parser.add_argument(
        '-d', '--download', action='store_true',
        help="Just download html files without parsing them"
    )
    return parser


def main() -> Optional[List[Project]]:
    parser = argparser()
    args = parser.parse_args()
    output = Path(args.output)

    if args.download:
        print('Just downloading files without parsing')

    download_all(root_url=args.url, output_folder=output)

    if args.download:
        # just download, don't parse
        return None

    projects = parse_folder(path=output)
    return projects

scrape_ob/parse.py

@ -1,27 +1,73 @@
"""
Models and parsers for OB pages
"""
from typing import ClassVar, List, TypedDict
import re
from typing import ClassVar, List, TypedDict, Optional, Literal, TYPE_CHECKING, Union
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from pprint import pformat
from bs4 import BeautifulSoup
from bs4.element import Tag
from parse import parse as _parse
from pypandoc import convert_text
from tqdm import tqdm
from scrape_ob.util import get_agent
@dataclass
class Element:
selector:ClassVar[str]
soup:BeautifulSoup
@classmethod
def from_soup(cls, soup:BeautifulSoup) -> 'Element':
return cls(soup=soup.select(cls.selector))
BLURB_TYPES = Literal['docs', 'paper', 'repo', 'homepage']
class Category:
selector = ".et_pb_module_header a"
@dataclass
class Blurb:
SELECTOR: ClassVar[str] = '.et_pb_blurb_content'
HEADER: ClassVar[str] = 'h4'
DESCRIPTION: ClassVar[str] = '.et_pb_blurb_description'
name: str
body: Tag
description: Tag
links: List[str]
type: Optional[BLURB_TYPES] = None
@classmethod
def from_soup(cls, blurb) -> 'Blurb':
"""
From the bs4 Tag element that contains the blurb
"""
body = blurb
name = blurb.find(cls.HEADER).text
description = blurb.select(cls.DESCRIPTION)[0]
links = list(set([a.get('href') for a in blurb.find_all('a')]))
return Blurb(
body = body,
name = name,
description = description,
links = links
)
def dict(self) -> dict:
return {
k:getattr(self,k) for k,field in self.__dataclass_fields__.items() \
if str(field._field_type) != "_FIELD_CLASSVAR"
}
def __str__(self) -> str:
adict = self.dict()
del adict['body']
del adict['description']
return pformat(adict, indent=2)
def __repr__(self) -> str:
return self.__str__()
@dataclass
class Project:
class selectors:
@ -29,20 +75,28 @@ class Project:
        article = "article"
        date = "span.published"
        body = ".et_pb_text_inner"
        blurb = ".et_pb_blurb_content .et_pb_module_header a"
        name = '.entry-title'

    class patterns:
        category = "project_category-{}"
        tag = "project_tag-{}"
        rrid = re.compile(r"RRID:\s{0,2}([\w\d_]*)")

    DATE_FMT: ClassVar[str] = "%b %d, %Y"
    """eg. Oct 20, 2022"""

    name: str
    url: str
    body: str
    date: datetime
    tags: List[str]
    categories: List[str]
    rrids: List[str]
    blurbs: List[Blurb]
    docs: Optional[str] = None
    repo: Optional[str] = None
    paper: Optional[str] = None

    @classmethod
    def parse_pattern(cls, classes:List[str], pattern:str) -> List[str]:
@ -50,16 +104,92 @@ class Project:
        return [t[0] for t in tags if t is not None]

    @classmethod
    def from_url(cls, url) -> 'Project':
        page = BeautifulSoup(get_agent(url).content, 'lxml')
        classes = page.select(cls.selectors.article)[0].get('class')
        tags = cls.parse_pattern(classes, cls.patterns.tag)
        categories = cls.parse_pattern(classes, cls.patterns.category)
    def from_soup(cls, page:BeautifulSoup) -> 'Project':
        name = page.select(cls.selectors.name)[0].text
        date = datetime.strptime(
            page.select(cls.selectors.date)[0].text,
            cls.DATE_FMT
        )
        url = page.find('link', rel="canonical").get('href')

        # parse tag & category metadata
        classes = page.select(cls.selectors.article)[0].get('class')
        tags = cls.parse_pattern(classes, cls.patterns.tag)
        categories = cls.parse_pattern(classes, cls.patterns.category)

        # parse body text
        body = page.select(cls.selectors.body)[0]
        # TODO Parse HTML to markdown
        body_markdown = convert_text(body, to="commonmark-raw_html", format="html")
        rrids = cls.patterns.rrid.findall(body_markdown)

        # parse blurbs (links to documentation, etc.)
        blurbs = [Blurb.from_soup(blurb) for blurb in page.select(Blurb.SELECTOR)]

        return Project(
            name = name,
            date = date,
            url = url,
            body = body_markdown,
            tags = tags,
            categories = categories,
            rrids = rrids,
            blurbs = blurbs
        )

    @classmethod
    def from_url(cls, url:str) -> 'Project':
        page = BeautifulSoup(get_agent(url).content, 'lxml')
        return cls.from_soup(page)

    @classmethod
    def from_file(cls, path:Union[Path, str]) -> 'Project':
        with open(path, 'r') as file:
            html = file.read()
        page = BeautifulSoup(html, 'lxml')
        return cls.from_soup(page)

    def dict(self) -> dict:
        return {
            k:getattr(self,k) for k,field in self.__dataclass_fields__.items() \
            if str(field._field_type) != "_FIELD_CLASSVAR"
        }

    def __str__(self) -> str:
        adict = self.dict()
        del adict['body']
        return pformat(adict, indent=2)


def parse_folder(path: Path=Path('./html'), verbose=True) -> List[Project]:
    """
    Parse a directory of downloaded HTML files into
    :class:`.Project` objects!

    Args:
        path (:class:`pathlib.Path`): Directory of downloaded html files

    Returns:
        List[Project]: A list of project objects!
    """
    path = Path(path)
    html_files = list(path.glob('*.html'))

    if verbose:
        print('Parsing downloaded HTML')
        pbar = tqdm(total=len(html_files))
    else:
        pbar = None

    projects = []
    for f in html_files:
        try:
            project = Project.from_file(f)
        except Exception as e:
            print(f"\nException parsing {str(f)}, got {e}")
            continue
        projects.append(project)
        if verbose:
            pbar.update()

    return projects

scrape_ob/scrape.py

@ -1,12 +1,16 @@
"""
Scraping and iteration code
"""
from pathlib import Path
from tqdm import tqdm
from scrape_ob import OB_ROOT
from bs4 import BeautifulSoup
from typing import List
from typing import List, Union
from scrape_ob.parse import Category, Project
from scrape_ob.util import get_agent
from scrape_ob.parse import Project
from scrape_ob.util import get_agent, project_name
def get_root(root_url:str=OB_ROOT) -> BeautifulSoup:
@ -23,4 +27,62 @@ def list_projects(category_url:str) -> List[str]:
    category = BeautifulSoup(category.content, 'lxml')
    projects = category.select(Project.selectors.index)
    return [p.get('href') for p in projects]

def list_all_projects(root_url:str=OB_ROOT, dedupe:bool=True) -> List[str]:
    """
    Wraps the other scraping functions to return a list of all projects
    in all categories as a flat list

    Args:
        root_url: Root of the 'open source projects' page

    Returns:
        list[str]: List of project page URLs
    """
    root = get_root(root_url)
    categories = list_category_urls(root)

    projects = []
    for category in categories:
        projects.extend(list_projects(category))

    if dedupe:
        projects = list(set(projects))

    return projects


def download_all(
    root_url:str=OB_ROOT,
    output_folder:Union[Path,str]=Path('./html'),
    verbose:bool=True
):
    """
    Download all OB pages, saving raw HTML to disk.

    Args:
        root_url (str): Root of the open source tools page
        output_folder (:class:`pathlib.Path`): Directory to save HTML sources to
        verbose (bool): Print status messages and progress bars
    """
    output_folder = Path(output_folder)
    output_folder.mkdir(exist_ok=True)

    # list all project urls
    projects = list_all_projects(root_url)

    # avoid downloading files that we already have
    existing_files = [p.name for p in output_folder.glob("*.html")]
    filtered_projects = [p for p in projects if project_name(p) + '.html' not in existing_files]

    if verbose:
        print(f'\nTotal projects: {len(projects)}\nExisting files: {len(existing_files)}\nDownloading files: {len(filtered_projects)}')
        pbar = tqdm(total=len(filtered_projects))
    else:
        pbar = None

    # Download everything to .html files!
    for project in filtered_projects:
        page = get_agent(project)
        output_file = output_folder / (project_name(project) + '.html')
        with open(output_file, 'wb') as output:
            output.write(page.content)
        if verbose:
            pbar.update()

scrape_ob/util.py

@ -1,4 +1,5 @@
from random import choice
from urllib.parse import urlsplit
import requests
@ -17,4 +18,22 @@ def get_agent(url, **kwargs) -> requests.Response:
        url,
        **kwargs
    )
    return gotten

def project_name(url:str) -> str:
    """
    Get the project name from the URL of an OB page

    Examples:
        'https://edspace.american.edu/openbehavior/project/ethoscopes/'
        becomes
        'ethoscopes'

    Args:
        url: URL of project page

    Returns:
        str
    """
    return urlsplit(url).path.strip('/').split('/')[-1]