From 0c1d340541426127fad9126ca9a4fb1d287b732b Mon Sep 17 00:00:00 2001
From: sneakers-the-rat
Date: Sat, 29 Oct 2022 20:08:02 -0700
Subject: [PATCH] working version!

---
 .gitignore          |   2 +
 README.md           | 177 ++++++++++++++++++++++++++++++++++++++++++++
 poetry.lock         |  35 ++++++++-
 pyproject.toml      |   6 +-
 scrape_ob/main.py   |  61 ++++++++++++++-
 scrape_ob/parse.py  | 164 +++++++++++++++++++++++++++++++++++-----
 scrape_ob/scrape.py |  70 +++++++++++++++++-
 scrape_ob/util.py   |  21 +++++-
 8 files changed, 510 insertions(+), 26 deletions(-)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1086f56
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+html
+__pycache__
diff --git a/README.md b/README.md
index 6be2902..a028c14 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,179 @@
 # scrape_ob
+Scrape Open Behavior and convert it to structured data
+
+## Components
+
+- `scrape.py` - utility functions for listing and downloading files
+- `parse.py` - parse downloaded HTML files
+- `main.py` - wrapper/entrypoint scripts
+
+## Usage
+
+```
+>>> scrape_ob --help
+usage: scrape_ob [-h] [-u URL] [-o OUTPUT] [-d]
+
+Scrape Open Behavior and return structured data
+
+options:
+  -h, --help            show this help message and exit
+  -u URL, --url URL     Root URL for open behavior's open source project directory. Default is https://edspace.american.edu/openbehavior/open-source-tools/
+  -o OUTPUT, --output OUTPUT
+                        Output directory to store downloaded html files in. Default is ./html
+  -d, --download        Just download html files without parsing them
+
+Be kind and always give credit where labor has been done
+
+```
+
+### Downloading
+
+We try to always work on local copies of the files so that we aren't asking too
+much of their server, so we start by downloading HTML copies of the project
+descriptions, say to the `./html` folder.
+
+Interactively:
+```python
+from scrape_ob.scrape import download_all
+download_all(output_folder='./html')
+```
+
+From the CLI:
+```
+scrape_ob --download
+```
+
+### Parsing
+
+From there we can parse an individual project's HTML representation into a
+structured one using two primary dataclasses:
+
+* `Project`: The primary representation of a project
+* `Blurb`: The image and link boxes at the bottom of a page that link to
+  further details about a project. Since these don't have structured
+  information, use inconsistent terminology, embed links within text, etc.,
+  we are leaving them relatively unprocessed for now, pending further
+  refinement of the parsing classes.
+
+The `Project` class can parse projects given a URL or a file, e.g.:
+
+```python
+from scrape_ob.parse import Project
+
+ap = Project.from_file('html/autopilot.html')
+```
+
+This gives us a structured representation of the project, which we can access
+directly as attributes or pull out as a dictionary like:
+
+(omitting the `body` attribute for clarity)
+```python
+>>> print(ap.dict())
+
+{
+    'name': 'AutoPilot: python framework for behavior experiments with raspberry pi',
+    'url': 'https://edspace.american.edu/openbehavior/project/autopilot/',
+    'date': datetime.datetime(2019, 12, 12, 0, 0),
+    'tags': [
+        'automated',
+        'cognition',
+        'decision-making',
+        'gui',
+        'hardware',
+        'perceptual',
+        'raspberrypi',
+        'sensory',
+        'software'
+    ],
+    'categories': [
+        'behavior-apparatus',
+        'behavior-measurement',
+        'data-analysis-software',
+        'behavior-rigs',
+        'behavior-analysis',
+        'behavioral-tasks',
+        'freely-moving',
+        'integrated-systems',
+        'stimuli'
+    ],
+    'rrids': ['SCR_021448', 'SCR_021518'],
+    'blurbs': [
+        {
+            'links': [
+                'https://www.biorxiv.org/content/10.1101/807693v1'
+            ],
+            'name': 'Paper',
+            'type': None},
+        {
+            'links': [
+                'https://github.com/wehr-lab/autopilot',
+                'http://docs.auto-pi-lot.com/'
+            ],
+            'name': 'Github',
+            'type': None
+        },
+        {
+            'links': [
+                'https://auto-pi-lot.com/',
+                'https://auto-pi-lot.com/presentation/#/'
+            ],
+            'name': 'Website',
+            'type': None
+        }
+    ],
+    'docs': None,
+    'repo': None,
+    'paper': None
+}
+```
+
+Note how we are able to pull out the "category" and "tags" information that is
+usually hidden as part of the page metadata.
+
+The extra `docs`, `repo`, and `paper` fields are currently left unfilled, as we
+will eventually use the `Blurb` class to parse out that information.
+
+The body of the project description is extracted into `body` and converted to
+markdown :)
+
+```markdown
+Jonny Saunders from Michael Wehr’s lab at the University of Oregon
+recently posted a preprint documenting their project Autopilot, which is
+a python framework for running behavioral experiments:
+
+------------------------------------------------------------------------
+
+[Autopilot](https://auto-pi-lot.com/)\xa0is a python framework for
+behavioral experiments through utilizing\xa0[Raspberry Pi
+microcontrollers](https://www.raspberrypi.org/). Autopilot incorporates
+all aspects of an experiment, including the hardware, stimuli,
+behavioral task paradigm, data management, data visualization, and a
+user interface. The authors propose that Autopilot is the fastest, least
+expensive, most flexibile behavioral system that is currently available.
+
+The benefit of using Autopilot is that it allows more experimental
+flexibility, which lets researchers to optimize it for their specific
+experimental needs. Additionally, this project exemplifies how useful a
+raspberry pi can be for performing experiments and recording data. The
+preprint discusses many benefits of raspberry pis, including their
+speed, precision and proper data logging, and they only cost $35 (!!).
+Ultimately, the authors developed Autopilot in an effort to encourage
+users to write reusable, portable experiments that is put into a public
+central library to push replication and reproducibility.
+
+*This research tool was created by your colleagues.
Please acknowledge +the Principal Investigator, cite the article in which the tool was +described, and include an\xa0RRID\xa0in the Materials and Methods of your +future publications.\xa0\xa0Project portal\xa0RRID:SCR_021448; +Software\xa0RRID:SCR_021518* +``` + +The utility function `parse_folder` can be used to parse the entire downloaded +folder of html documents! + + +## TODO + +- Complete parsing of blurbs +- Export `Project` to structured markdown with YAML headers or to whatever format + open neuro ppl want +- Make mediawiki template and client code to push into the open neuro wiki diff --git a/poetry.lock b/poetry.lock index 013485d..73ab65c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -32,6 +32,14 @@ python-versions = ">=3.6.0" [package.extras] unicode-backport = ["unicodedata2"] +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." +category = "main" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" + [[package]] name = "idna" version = "3.4" @@ -96,6 +104,23 @@ category = "main" optional = false python-versions = ">=3.6" +[[package]] +name = "tqdm" +version = "4.64.1" +description = "Fast, Extensible Progress Meter" +category = "main" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[package.extras] +dev = ["py-make (>=0.1.0)", "twine", "wheel"] +notebook = ["ipywidgets (>=6)"] +slack = ["slack-sdk"] +telegram = ["requests"] + [[package]] name = "urllib3" version = "1.26.12" @@ -112,7 +137,7 @@ socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "ca14c6295aaa4989085d6b4bd70b5e9d8299641e61f4e0b16d068d7c02a182bc" +content-hash = "bb58291e260d36ddcb0818ccf9b03cbe3e3bef25241fa3366fcfb18294720cf3" [metadata.files] beautifulsoup4 = [ @@ -127,6 +152,10 @@ charset-normalizer = [ {file = "charset-normalizer-2.1.1.tar.gz", hash = "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845"}, {file = "charset_normalizer-2.1.1-py3-none-any.whl", hash = "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"}, ] +colorama = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] idna = [ {file = "idna-3.4-py3-none-any.whl", hash = "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"}, {file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"}, @@ -218,6 +247,10 @@ soupsieve = [ {file = "soupsieve-2.3.2.post1-py3-none-any.whl", hash = "sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759"}, {file = "soupsieve-2.3.2.post1.tar.gz", hash = "sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d"}, ] +tqdm = [ + {file = "tqdm-4.64.1-py2.py3-none-any.whl", hash = "sha256:6fee160d6ffcd1b1c68c65f14c829c22832bc401726335ce92c52d395944a6a1"}, + {file = "tqdm-4.64.1.tar.gz", hash = "sha256:5f4f682a004951c1b450bc753c710e9280c5746ce6ffedee253ddbcbf54cf1e4"}, +] urllib3 = [ {file = "urllib3-1.26.12-py2.py3-none-any.whl", hash = "sha256:b930dd878d5a8afb066a637fbb35144fe7901e3b209d1cd4f524bd0e9deee997"}, {file = "urllib3-1.26.12.tar.gz", hash = 
"sha256:3fa96cf423e6987997fc326ae8df396db2a8b7c667747d47ddd8ecba91f4a74e"}, diff --git a/pyproject.toml b/pyproject.toml index 13e875c..c3e7ddf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,9 @@ [tool.poetry] name = "scrape-ob" version = "0.1.0" -description = "" +description = "Scrape Open Behavior repository and return structured data" authors = ["sneakers-the-rat "] +repository = "https://git.jon-e.net/jonny/scrape_ob" license = "AGPL-3.0" readme = "README.md" packages = [{include = "scrape_ob"}] @@ -14,7 +15,10 @@ beautifulsoup4 = "^4.11.1" lxml = "^4.9.1" parse = "^1.19.0" pypandoc = "^1.9" +tqdm = "^4.64.1" +[tool.poetry.scripts] +scrape_ob = 'scrape_ob.main:main' [build-system] requires = ["poetry-core"] diff --git a/scrape_ob/main.py b/scrape_ob/main.py index c92054f..5c36bb6 100644 --- a/scrape_ob/main.py +++ b/scrape_ob/main.py @@ -1,3 +1,60 @@ """ -User-facing runtime functions -""" \ No newline at end of file +User-facing runtime functions. + +We try and be kind by keeping and caching local copies of the HTML +instead of hitting the server every time, so we should always start with +the `download_all` function, which will skip any project files +that we already have downloaded in the output_folder +""" +from pathlib import Path +import argparse + +from scrape_ob.scrape import download_all +from scrape_ob.constants import OB_ROOT +from scrape_ob.parse import Project, parse_folder +from typing import List, Optional + + +def argparser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="scrape_ob", + description="Scrape Open Behavior and return structured data", + epilog="Be kind and always give credit where labor has been done" + ) + parser.add_argument( + '-u', '--url', default=OB_ROOT, required=False, + help=f"Root URL for open behavior's open source project directory. Default is {OB_ROOT}" + ) + parser.add_argument( + '-o', '--output', default='./html', required=False, type=Path, + help="Output directory to store downloaded html files in. 
Default is ./html" + ) + parser.add_argument( + '-d', '--download', action='store_true', + help="Just download html files without parsing them" + ) + + return parser + + +def main() -> Optional[List[Project]]: + parser = argparser() + args = parser.parse_args() + + output = Path(args.output) + + if args.download: + print('Just downloading files without parsing') + + download_all(root_url=args.url, output_folder=output) + + if args.download: + # just download, don't parse + return None + + projects = parse_folder(path=output) + return projects + + + + diff --git a/scrape_ob/parse.py b/scrape_ob/parse.py index 89e5747..5bb1ac8 100644 --- a/scrape_ob/parse.py +++ b/scrape_ob/parse.py @@ -1,27 +1,73 @@ """ Models and parsers for OB pages """ -from typing import ClassVar, List, TypedDict +import re +from typing import ClassVar, List, TypedDict, Optional, Literal, TYPE_CHECKING, Union from dataclasses import dataclass from datetime import datetime +from pathlib import Path +from pprint import pformat + from bs4 import BeautifulSoup +from bs4.element import Tag + from parse import parse as _parse +from pypandoc import convert_text +from tqdm import tqdm from scrape_ob.util import get_agent -@dataclass -class Element: - selector:ClassVar[str] - soup:BeautifulSoup - - @classmethod - def from_soup(cls, soup:BeautifulSoup) -> 'Element': - return cls(soup=soup.select(cls.selector)) +BLURB_TYPES = Literal['docs', 'paper', 'repo', 'homepage'] class Category: selector = ".et_pb_module_header a" +@dataclass +class Blurb: + SELECTOR: ClassVar[str] = '.et_pb_blurb_content' + HEADER: ClassVar[str] = 'h4' + DESCRIPTION: ClassVar[str] = '.et_pb_blurb_description' + + name: str + body: Tag + description: Tag + links: List[str] + type: Optional[BLURB_TYPES] = None + + @classmethod + def from_soup(cls, blurb) -> 'Blurb': + """ + From the bs4 Tag element that contains the blurb + """ + body = blurb + name = blurb.find(cls.HEADER).text + description = blurb.select(cls.DESCRIPTION)[0] + links = list(set([a.get('href') for a in blurb.find_all('a')])) + return Blurb( + body = body, + name = name, + description = description, + links = links + ) + + def dict(self) -> dict: + return { + k:getattr(self,k) for k,field in self.__dataclass_fields__.items() \ + if str(field._field_type) != "_FIELD_CLASSVAR" + } + + def __str__(self) -> str: + adict = self.dict() + del adict['body'] + del adict['description'] + return pformat(adict, indent=2) + + def __repr__(self) -> str: + return self.__str__() + + + @dataclass class Project: class selectors: @@ -29,20 +75,28 @@ class Project: article = "article" date = "span.published" body = ".et_pb_text_inner" - blurb = ".et_pb_blurb_content .et_pb_module_header a" + name = '.entry-title' class patterns: category = "project_category-{}" tag = "project_tag-{}" + rrid = re.compile(r"RRID:\s{0,2}([\w\d_]*)") DATE_FMT: ClassVar[str] = "%b %d, %Y" """eg. 
Oct 20, 2022""" name: str url: str + body: str date: datetime tags: List[str] categories: List[str] + rrids: List[str] + blurbs: List[Blurb] + docs: Optional[str] = None + repo: Optional[str] = None + paper: Optional[str] = None + @classmethod def parse_pattern(cls, classes:List[str], pattern:str) -> List[str]: @@ -50,16 +104,92 @@ class Project: return [t[0] for t in tags if t is not None] @classmethod - def from_url(cls, url) -> 'Project': - page = BeautifulSoup(get_agent(url).content, 'lxml') - - classes = page.select(cls.selectors.article)[0].get('class') - tags = cls.parse_pattern(classes, cls.patterns.tag) - categories = cls.parse_pattern(classes, cls.patterns.category) + def from_soup(cls, page:BeautifulSoup) -> 'Project': + name = page.select(cls.selectors.name)[0].text date = datetime.strptime( page.select(cls.selectors.date)[0].text, cls.DATE_FMT ) + url = page.find('link', rel="canonical").get('href') + # parse tag & category metadata + classes = page.select(cls.selectors.article)[0].get('class') + tags = cls.parse_pattern(classes, cls.patterns.tag) + categories = cls.parse_pattern(classes, cls.patterns.category) + + # parse body text body = page.select(cls.selectors.body)[0] - # TODO Parse HTML to markdown + body_markdown = convert_text(body, to="commonmark-raw_html", format="html") + rrids = cls.patterns.rrid.findall(body_markdown) + + # parse blurbs (links to documentation, etc.) + blurbs = [Blurb.from_soup(blurb) for blurb in page.select(Blurb.SELECTOR)] + + return Project( + name = name, + date = date, + url = url, + body = body_markdown, + tags = tags, + categories = categories, + rrids = rrids, + blurbs = blurbs + ) + + @classmethod + def from_url(cls, url:str) -> 'Project': + page = BeautifulSoup(get_agent(url).content, 'lxml') + return cls.from_soup(page) + + @classmethod + def from_file(cls, path:Union[Path, str]) -> 'Project': + with open(path, 'r') as file: + html = file.read() + page = BeautifulSoup(html, 'lxml') + return cls.from_soup(page) + + def dict(self) -> dict: + return { + k:getattr(self,k) for k,field in self.__dataclass_fields__.items() \ + if str(field._field_type) != "_FIELD_CLASSVAR" + } + + def __str__(self) -> str: + adict = self.dict() + del adict['body'] + return pformat(adict, indent=2) + + +def parse_folder(path: Path=Path('./html'), verbose=True) -> List[Project]: + """ + Parse a directory of downloaded HTML files into + :class:`.Project` objects! + + Args: + path (:class:`pathlib.Path`): Directory of downloaded html files + + Returns: + List[Project]: A list of project objects! 
+    """
+    path = Path(path)
+    html_files = list(path.glob('*.html'))
+    if verbose:
+        print('Parsing downloaded HTML')
+        pbar = tqdm(total=len(html_files))
+    else:
+        pbar = None
+
+    projects = []
+    for f in html_files:
+        try:
+            project = Project.from_file(f)
+            projects.append(project)
+        except Exception as e:
+            print(f"\nException parsing {str(f)}, got {e}")
+        finally:
+            # advance the progress bar whether or not parsing succeeded
+            if verbose:
+                pbar.update()
+
+    return projects
\ No newline at end of file
diff --git a/scrape_ob/scrape.py b/scrape_ob/scrape.py
index 11edc88..bbf701b 100644
--- a/scrape_ob/scrape.py
+++ b/scrape_ob/scrape.py
@@ -1,12 +1,16 @@
 """
 Scraping and iteration code
 """
+from pathlib import Path
+
+from tqdm import tqdm
+
 from scrape_ob import OB_ROOT
 from bs4 import BeautifulSoup
-from typing import List
+from typing import List, Union
 
-from scrape_ob.parse import Category, Project
-from scrape_ob.util import get_agent
+from scrape_ob.parse import Project
+from scrape_ob.util import get_agent, project_name
 
 
 def get_root(root_url:str=OB_ROOT) -> BeautifulSoup:
@@ -23,4 +27,62 @@ def list_projects(category_url:str) -> List[str]:
     category = BeautifulSoup(category.content, 'lxml')
     projects = category.select(Project.selectors.index)
 
-    return [p.get('href') for p in projects]
\ No newline at end of file
+    return [p.get('href') for p in projects]
+
+def list_all_projects(root_url:str=OB_ROOT, dedupe:bool=True) -> List[str]:
+    """
+    Wraps the other scraping functions to return a list of all projects
+    in all categories as a flat list
+
+    Args:
+        root_url: Root of the 'open source projects' page
+        dedupe: If ``True``, remove duplicate project URLs (projects can appear in multiple categories)
+
+    Returns:
+        list[str]: List of project page URLs
+    """
+    root = get_root(root_url)
+    categories = list_category_urls(root)
+    projects = []
+    for category in categories:
+        projects.extend(list_projects(category))
+    if dedupe:
+        projects = list(set(projects))
+    return projects
+
+
+def download_all(
+        root_url:str=OB_ROOT,
+        output_folder:Union[Path,str]=Path('./html'),
+        verbose:bool=True
+):
+    """
+    Download all OB pages, saving raw HTML to disk.
+
+    Args:
+        root_url (str): Root of the open source tools page
+        output_folder (:class:`pathlib.Path`): Directory to save HTML sources to
+        verbose (bool): Print status messages and progress bars
+    """
+    output_folder = Path(output_folder)
+    output_folder.mkdir(exist_ok=True)
+
+    # list all project urls
+    projects = list_all_projects(root_url)
+
+    # avoid downloading files that we already have
+    existing_files = [p.name for p in output_folder.glob("*.html")]
+    filtered_projects = [p for p in projects if project_name(p) + '.html' not in existing_files]
+    if verbose:
+        print(f'\nTotal projects: {len(projects)}\nExisting files: {len(existing_files)}\nDownloading files: {len(filtered_projects)}')
+        pbar = tqdm(total=len(filtered_projects))
+    else:
+        pbar = None
+
+    # Download everything to .html files!
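+    # Each page is written unmodified as raw bytes to <output_folder>/<project_name>.html,
+    # so a re-run of download_all will skip it via the existing-file filter above.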
+    for project in filtered_projects:
+        page = get_agent(project)
+        output_file = output_folder / (project_name(project) + '.html')
+        with open(output_file, 'wb') as output:
+            output.write(page.content)
+        if verbose:
+            pbar.update()
\ No newline at end of file
diff --git a/scrape_ob/util.py b/scrape_ob/util.py
index 7c53197..eb51091 100644
--- a/scrape_ob/util.py
+++ b/scrape_ob/util.py
@@ -1,4 +1,5 @@
 from random import choice
+from urllib.parse import urlsplit
 
 import requests
 
@@ -17,4 +18,22 @@ def get_agent(url, **kwargs) -> requests.Response:
         url,
         **kwargs
     )
-    return gotten
\ No newline at end of file
+    return gotten
+
+
+def project_name(url:str) -> str:
+    """
+    Get the project name from the URL of an OB page
+
+    Examples:
+        'https://edspace.american.edu/openbehavior/project/ethoscopes/'
+        becomes
+        'ethoscopes'
+
+    Args:
+        url: URL of project page
+
+    Returns:
+        str
+    """
+    return urlsplit(url).path.strip('/').split('/')[-1]