working version!
This commit is contained in:
parent
bb6775d14e
commit
0c1d340541
8 changed files with 510 additions and 26 deletions
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
html
|
||||||
|
__pycache__
|
177
README.md
177
README.md
|
@ -1,2 +1,179 @@
|
||||||
# scrape_ob
|
# scrape_ob
|
||||||
|
|
||||||
|
Scrape open behavior and convert to structured data
|
||||||
|
|
||||||
|
## Components
|
||||||
|
|
||||||
|
- `scrape.py` - utility functions for listing and downloading files
|
||||||
|
- `parse.py` - Parse downloaded HTML files
|
||||||
|
- `main.py` - Wrapper/entrypoint scripts
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
```
|
||||||
|
>>> scrape_ob --help
|
||||||
|
usage: scrape_ob [-h] [-u URL] [-o OUTPUT] [-d]
|
||||||
|
|
||||||
|
Scrape Open Behavior and return structured data
|
||||||
|
|
||||||
|
options:
|
||||||
|
-h, --help show this help message and exit
|
||||||
|
-u URL, --url URL Root URL for open behavior's open source project directory. Default is https://edspace.american.edu/openbehavior/open-source-tools/
|
||||||
|
-o OUTPUT, --output OUTPUT
|
||||||
|
Output directory to store downloaded html files in. Default is ./html
|
||||||
|
-d, --download Just download html files without parsing them
|
||||||
|
|
||||||
|
Be kind and always give credit where labor has been done
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
### Downloading
|
||||||
|
|
||||||
|
We try and always work on local copies of the files so we are not requesting too much from their server, so first
|
||||||
|
we start by downloading HTML copies of their project descriptions, for example to the `./html` folder.
|
||||||
|
|
||||||
|
Interactively:
|
||||||
|
```python
|
||||||
|
from scrape_ob.scrape import download_all
|
||||||
|
download_all(output_folder='./html')
|
||||||
|
```
|
||||||
|
|
||||||
|
From the cli
|
||||||
|
```shell
|
||||||
|
scrape_ob --download
|
||||||
|
```
|
||||||
|
|
||||||
|
### Parsing
|
||||||
|
|
||||||
|
From there we can parse an individual project's HTML representation into a
|
||||||
|
structured one using two primary dataclasses,
|
||||||
|
|
||||||
|
* `Project`: The primary representation of a project
|
||||||
|
* `Blurb`: The image and link boxes at the bottom of a page that link to
|
||||||
|
further details about a project. Since these don't have structure information,
|
||||||
|
and will use different terminology, embed links within text, etc. we are leaving them
|
||||||
|
as relatively unprocessed for now, pending further refinement of the parsing classes
|
||||||
|
|
||||||
|
The Project class can parse projects given a url or a file, eg.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from scrape_ob.parse import Project
|
||||||
|
|
||||||
|
ap = Project.from_file('html/autopilot.html')
|
||||||
|
```
|
||||||
|
|
||||||
|
Which gives us a structured representation of the project, which
|
||||||
|
we can access directly as attributes or pull out as a dictionary like:
|
||||||
|
|
||||||
|
(omitting the `body` attribute for clarity)
|
||||||
|
```python
|
||||||
|
>>> print(ap.dict())
|
||||||
|
|
||||||
|
{
|
||||||
|
'name': 'AutoPilot: python framework for behavior experiments with raspberry pi',
|
||||||
|
'url': 'https://edspace.american.edu/openbehavior/project/autopilot/',
|
||||||
|
'date': datetime.datetime(2019, 12, 12, 0, 0),
|
||||||
|
'tags': [
|
||||||
|
'automated',
|
||||||
|
'cognition',
|
||||||
|
'decision-making',
|
||||||
|
'gui',
|
||||||
|
'hardware',
|
||||||
|
'perceptual',
|
||||||
|
'raspberrypi',
|
||||||
|
'sensory',
|
||||||
|
'software'
|
||||||
|
],
|
||||||
|
'categories': [
|
||||||
|
'behavior-apparatus',
|
||||||
|
'behavior-measurement',
|
||||||
|
'data-analysis-software',
|
||||||
|
'behavior-rigs',
|
||||||
|
'behavior-analysis',
|
||||||
|
'behavioral-tasks',
|
||||||
|
'freely-moving',
|
||||||
|
'integrated-systems',
|
||||||
|
'stimuli'
|
||||||
|
],
|
||||||
|
'rrids': ['SCR_021448', 'SCR_021518'],
|
||||||
|
'blurbs': [
|
||||||
|
{
|
||||||
|
'links': [
|
||||||
|
'https://www.biorxiv.org/content/10.1101/807693v1'
|
||||||
|
],
|
||||||
|
'name': 'Paper',
|
||||||
|
'type': None},
|
||||||
|
{
|
||||||
|
'links': [
|
||||||
|
'https://github.com/wehr-lab/autopilot',
|
||||||
|
'http://docs.auto-pi-lot.com/'
|
||||||
|
],
|
||||||
|
'name': 'Github',
|
||||||
|
'type': None
|
||||||
|
},
|
||||||
|
{
|
||||||
|
'links': [
|
||||||
|
'https://auto-pi-lot.com/',
|
||||||
|
'https://auto-pi-lot.com/presentation/#/'
|
||||||
|
],
|
||||||
|
'name': 'Website',
|
||||||
|
'type': None
|
||||||
|
}
|
||||||
|
],
|
||||||
|
'docs': None,
|
||||||
|
'repo': None,
|
||||||
|
'paper': None
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Note how we are able to pull out the "category" and "tags" information which
|
||||||
|
is usually hidden as part of the page metadata.
|
||||||
|
|
||||||
|
The extra `docs`, `repo`, and `paper` fields are currently left unfilled, as we
|
||||||
|
will eventually use the `Blurb` class to parse out that information
|
||||||
|
|
||||||
|
The body of the project description is extracted into `body` and converted to
|
||||||
|
markdown :)
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
Jonny Saunders from Michael Wehr’s lab at the University of Oregon
|
||||||
|
recently posted a preprint documenting their project Autopilot, which is
|
||||||
|
a python framework for running behavioral experiments:
|
||||||
|
|
||||||
|
------------------------------------------------------------------------
|
||||||
|
|
||||||
|
[Autopilot](https://auto-pi-lot.com/)\xa0is a python framework for
|
||||||
|
behavioral experiments through utilizing\xa0[Raspberry Pi
|
||||||
|
microcontrollers](https://www.raspberrypi.org/). Autopilot incorporates
|
||||||
|
all aspects of an experiment, including the hardware, stimuli,
|
||||||
|
behavioral task paradigm, data management, data visualization, and a
|
||||||
|
user interface. The authors propose that Autopilot is the fastest, least
|
||||||
|
expensive, most flexibile behavioral system that is currently available.
|
||||||
|
|
||||||
|
The benefit of using Autopilot is that it allows more experimental
|
||||||
|
flexibility, which lets researchers to optimize it for their specific
|
||||||
|
experimental needs. Additionally, this project exemplifies how useful a
|
||||||
|
raspberry pi can be for performing experiments and recording data. The
|
||||||
|
preprint discusses many benefits of raspberry pis, including their
|
||||||
|
speed, precision and proper data logging, and they only cost $35 (!!).
|
||||||
|
Ultimately, the authors developed Autopilot in an effort to encourage
|
||||||
|
users to write reusable, portable experiments that is put into a public
|
||||||
|
central library to push replication and reproducibility.
|
||||||
|
|
||||||
|
*This research tool was created by your colleagues. Please acknowledge
|
||||||
|
the Principal Investigator, cite the article in which the tool was
|
||||||
|
described, and include an\xa0RRID\xa0in the Materials and Methods of your
|
||||||
|
future publications.\xa0\xa0Project portal\xa0RRID:SCR_021448;
|
||||||
|
Software\xa0RRID:SCR_021518*
|
||||||
|
```
|
||||||
|
|
||||||
|
The utility function `parse_folder` can be used to parse the entire downloaded
|
||||||
|
folder of html documents!
|
||||||
|
|
||||||
|
|
||||||
|
## TODO
|
||||||
|
|
||||||
|
- Complete parsing of blurbs
|
||||||
|
- Export `Project` to structured markdown with YAML headers or to whatever format
|
||||||
|
open neuro ppl want
|
||||||
|
- Make mediawiki template and client code to push into the open neuro wiki
|
||||||
|
|
35
poetry.lock
generated
35
poetry.lock
generated
|
@ -32,6 +32,14 @@ python-versions = ">=3.6.0"
|
||||||
[package.extras]
|
[package.extras]
|
||||||
unicode-backport = ["unicodedata2"]
|
unicode-backport = ["unicodedata2"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "colorama"
|
||||||
|
version = "0.4.6"
|
||||||
|
description = "Cross-platform colored terminal text."
|
||||||
|
category = "main"
|
||||||
|
optional = false
|
||||||
|
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "idna"
|
name = "idna"
|
||||||
version = "3.4"
|
version = "3.4"
|
||||||
|
@ -96,6 +104,23 @@ category = "main"
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.6"
|
python-versions = ">=3.6"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tqdm"
|
||||||
|
version = "4.64.1"
|
||||||
|
description = "Fast, Extensible Progress Meter"
|
||||||
|
category = "main"
|
||||||
|
optional = false
|
||||||
|
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7"
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
colorama = {version = "*", markers = "platform_system == \"Windows\""}
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
dev = ["py-make (>=0.1.0)", "twine", "wheel"]
|
||||||
|
notebook = ["ipywidgets (>=6)"]
|
||||||
|
slack = ["slack-sdk"]
|
||||||
|
telegram = ["requests"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "urllib3"
|
name = "urllib3"
|
||||||
version = "1.26.12"
|
version = "1.26.12"
|
||||||
|
@ -112,7 +137,7 @@ socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"]
|
||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "1.1"
|
lock-version = "1.1"
|
||||||
python-versions = "^3.9"
|
python-versions = "^3.9"
|
||||||
content-hash = "ca14c6295aaa4989085d6b4bd70b5e9d8299641e61f4e0b16d068d7c02a182bc"
|
content-hash = "bb58291e260d36ddcb0818ccf9b03cbe3e3bef25241fa3366fcfb18294720cf3"
|
||||||
|
|
||||||
[metadata.files]
|
[metadata.files]
|
||||||
beautifulsoup4 = [
|
beautifulsoup4 = [
|
||||||
|
@ -127,6 +152,10 @@ charset-normalizer = [
|
||||||
{file = "charset-normalizer-2.1.1.tar.gz", hash = "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845"},
|
{file = "charset-normalizer-2.1.1.tar.gz", hash = "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845"},
|
||||||
{file = "charset_normalizer-2.1.1-py3-none-any.whl", hash = "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"},
|
{file = "charset_normalizer-2.1.1-py3-none-any.whl", hash = "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"},
|
||||||
]
|
]
|
||||||
|
colorama = [
|
||||||
|
{file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
|
||||||
|
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
|
||||||
|
]
|
||||||
idna = [
|
idna = [
|
||||||
{file = "idna-3.4-py3-none-any.whl", hash = "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"},
|
{file = "idna-3.4-py3-none-any.whl", hash = "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"},
|
||||||
{file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"},
|
{file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"},
|
||||||
|
@ -218,6 +247,10 @@ soupsieve = [
|
||||||
{file = "soupsieve-2.3.2.post1-py3-none-any.whl", hash = "sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759"},
|
{file = "soupsieve-2.3.2.post1-py3-none-any.whl", hash = "sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759"},
|
||||||
{file = "soupsieve-2.3.2.post1.tar.gz", hash = "sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d"},
|
{file = "soupsieve-2.3.2.post1.tar.gz", hash = "sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d"},
|
||||||
]
|
]
|
||||||
|
tqdm = [
|
||||||
|
{file = "tqdm-4.64.1-py2.py3-none-any.whl", hash = "sha256:6fee160d6ffcd1b1c68c65f14c829c22832bc401726335ce92c52d395944a6a1"},
|
||||||
|
{file = "tqdm-4.64.1.tar.gz", hash = "sha256:5f4f682a004951c1b450bc753c710e9280c5746ce6ffedee253ddbcbf54cf1e4"},
|
||||||
|
]
|
||||||
urllib3 = [
|
urllib3 = [
|
||||||
{file = "urllib3-1.26.12-py2.py3-none-any.whl", hash = "sha256:b930dd878d5a8afb066a637fbb35144fe7901e3b209d1cd4f524bd0e9deee997"},
|
{file = "urllib3-1.26.12-py2.py3-none-any.whl", hash = "sha256:b930dd878d5a8afb066a637fbb35144fe7901e3b209d1cd4f524bd0e9deee997"},
|
||||||
{file = "urllib3-1.26.12.tar.gz", hash = "sha256:3fa96cf423e6987997fc326ae8df396db2a8b7c667747d47ddd8ecba91f4a74e"},
|
{file = "urllib3-1.26.12.tar.gz", hash = "sha256:3fa96cf423e6987997fc326ae8df396db2a8b7c667747d47ddd8ecba91f4a74e"},
|
||||||
|
|
|
@ -1,8 +1,9 @@
|
||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "scrape-ob"
|
name = "scrape-ob"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
description = ""
|
description = "Scrape Open Behavior repository and return structured data"
|
||||||
authors = ["sneakers-the-rat <JLSaunders987@gmail.com>"]
|
authors = ["sneakers-the-rat <JLSaunders987@gmail.com>"]
|
||||||
|
repository = "https://git.jon-e.net/jonny/scrape_ob"
|
||||||
license = "AGPL-3.0"
|
license = "AGPL-3.0"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
packages = [{include = "scrape_ob"}]
|
packages = [{include = "scrape_ob"}]
|
||||||
|
@ -14,7 +15,10 @@ beautifulsoup4 = "^4.11.1"
|
||||||
lxml = "^4.9.1"
|
lxml = "^4.9.1"
|
||||||
parse = "^1.19.0"
|
parse = "^1.19.0"
|
||||||
pypandoc = "^1.9"
|
pypandoc = "^1.9"
|
||||||
|
tqdm = "^4.64.1"
|
||||||
|
|
||||||
|
[tool.poetry.scripts]
|
||||||
|
scrape_ob = 'scrape_ob.main:main'
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
requires = ["poetry-core"]
|
requires = ["poetry-core"]
|
||||||
|
|
|
@ -1,3 +1,60 @@
|
||||||
"""
|
"""
|
||||||
User-facing runtime functions
|
User-facing runtime functions.
|
||||||
"""
|
|
||||||
|
We try and be kind by keeping and caching local copies of the HTML
|
||||||
|
instead of hitting the server every time, so we should always start with
|
||||||
|
the `download_all` function, which will skip any project files
|
||||||
|
that we already have downloaded in the output_folder
|
||||||
|
"""
|
||||||
|
from pathlib import Path
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
from scrape_ob.scrape import download_all
|
||||||
|
from scrape_ob.constants import OB_ROOT
|
||||||
|
from scrape_ob.parse import Project, parse_folder
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
|
||||||
|
def argparser() -> argparse.ArgumentParser:
    """
    Build the command line interface of ``scrape_ob``.

    Returns:
        argparse.ArgumentParser: Parser exposing the ``--url``, ``--output``
        and ``--download`` options.
    """
    cli = argparse.ArgumentParser(
        prog="scrape_ob",
        description="Scrape Open Behavior and return structured data",
        epilog="Be kind and always give credit where labor has been done",
    )

    # where to scrape from
    url_help = f"Root URL for open behavior's open source project directory. Default is {OB_ROOT}"
    cli.add_argument('-u', '--url', default=OB_ROOT, required=False, help=url_help)

    # where the html cache lives
    output_help = "Output directory to store downloaded html files in. Default is ./html"
    cli.add_argument('-o', '--output', default='./html', required=False, type=Path, help=output_help)

    # download-only switch
    download_help = "Just download html files without parsing them"
    cli.add_argument('-d', '--download', action='store_true', help=download_help)

    return cli
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> Optional[List[Project]]:
    """
    Command line entry point.

    Downloads every project page that is not yet in the output folder and
    then, unless ``--download`` was passed, parses the whole folder.

    Returns:
        Optional[List[Project]]: The parsed projects, or ``None`` when only
        downloading.
    """
    args = argparser().parse_args()
    html_dir = Path(args.output)
    download_only = args.download

    if download_only:
        print('Just downloading files without parsing')

    # the download always runs; files already on disk are skipped by download_all
    download_all(root_url=args.url, output_folder=html_dir)

    # NOTE(review): if this function is installed as a console script, the
    # wrapper presumably hands the return value to sys.exit(), so returning a
    # non-empty list may end the process with a non-zero status - confirm.
    return None if download_only else parse_folder(path=html_dir)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,27 +1,73 @@
|
||||||
"""
|
"""
|
||||||
Models and parsers for OB pages
|
Models and parsers for OB pages
|
||||||
"""
|
"""
|
||||||
from typing import ClassVar, List, TypedDict
|
import re
|
||||||
|
from typing import ClassVar, List, TypedDict, Optional, Literal, TYPE_CHECKING, Union
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from pprint import pformat
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
from bs4.element import Tag
|
||||||
|
|
||||||
from parse import parse as _parse
|
from parse import parse as _parse
|
||||||
|
from pypandoc import convert_text
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
from scrape_ob.util import get_agent
|
from scrape_ob.util import get_agent
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class Element:
|
|
||||||
selector:ClassVar[str]
|
|
||||||
soup:BeautifulSoup
|
|
||||||
|
|
||||||
|
BLURB_TYPES = Literal['docs', 'paper', 'repo', 'homepage']
|
||||||
@classmethod
|
|
||||||
def from_soup(cls, soup:BeautifulSoup) -> 'Element':
|
|
||||||
return cls(soup=soup.select(cls.selector))
|
|
||||||
|
|
||||||
class Category:
|
class Category:
|
||||||
selector = ".et_pb_module_header a"
|
selector = ".et_pb_module_header a"
|
||||||
|
|
||||||
|
@dataclass
class Blurb:
    """
    One of the image-and-link boxes of a project page that point to further
    details about the project (paper, repository, website, ...).

    The boxes are kept relatively unprocessed: the raw tags are stored next to
    the header text and the list of link targets found inside the box.
    """
    # CSS selector matching the container of a single blurb
    SELECTOR: ClassVar[str] = '.et_pb_blurb_content'
    # tag name of the element holding the blurb title
    HEADER: ClassVar[str] = 'h4'
    # CSS selector of the descriptive part of a blurb
    DESCRIPTION: ClassVar[str] = '.et_pb_blurb_description'

    name: str
    body: Tag
    description: Tag
    links: List[str]
    type: Optional[BLURB_TYPES] = None

    @classmethod
    def from_soup(cls, blurb) -> 'Blurb':
        """
        From the bs4 Tag element that contains the blurb

        Args:
            blurb: Tag selected with :attr:`.SELECTOR`

        Returns:
            Blurb: with ``type`` left at its default of ``None``

        Raises:
            AttributeError: if the blurb has no :attr:`.HEADER` element
            IndexError: if the blurb has no :attr:`.DESCRIPTION` element
        """
        name = blurb.find(cls.HEADER).text
        description = blurb.select(cls.DESCRIPTION)[0]

        # De-duplicate while keeping document order (a set would shuffle the
        # links differently on every run) and drop anchors without an href,
        # which would otherwise put None into a List[str].
        hrefs = (a.get('href') for a in blurb.find_all('a'))
        links = list(dict.fromkeys(href for href in hrefs if href is not None))

        # cls rather than Blurb so that subclasses get instances of themselves
        return cls(
            body=blurb,
            name=name,
            description=description,
            links=links,
        )

    def dict(self) -> dict:
        """
        Shallow mapping of field name to value.

        ``ClassVar`` attributes are not dataclass fields and are left out.
        """
        # local import: the module only imports ``dataclass`` at the top
        from dataclasses import fields

        return {f.name: getattr(self, f.name) for f in fields(self)}

    def __str__(self) -> str:
        """Pretty-printed fields, without the bulky ``body`` and ``description`` tags."""
        adict = self.dict()
        del adict['body']
        del adict['description']
        return pformat(adict, indent=2)

    def __repr__(self) -> str:
        """Same compact representation as :meth:`__str__`."""
        return self.__str__()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Project:
|
class Project:
|
||||||
class selectors:
|
class selectors:
|
||||||
|
@ -29,20 +75,28 @@ class Project:
|
||||||
article = "article"
|
article = "article"
|
||||||
date = "span.published"
|
date = "span.published"
|
||||||
body = ".et_pb_text_inner"
|
body = ".et_pb_text_inner"
|
||||||
blurb = ".et_pb_blurb_content .et_pb_module_header a"
|
name = '.entry-title'
|
||||||
|
|
||||||
class patterns:
|
class patterns:
|
||||||
category = "project_category-{}"
|
category = "project_category-{}"
|
||||||
tag = "project_tag-{}"
|
tag = "project_tag-{}"
|
||||||
|
rrid = re.compile(r"RRID:\s{0,2}([\w\d_]*)")
|
||||||
|
|
||||||
DATE_FMT: ClassVar[str] = "%b %d, %Y"
|
DATE_FMT: ClassVar[str] = "%b %d, %Y"
|
||||||
"""eg. Oct 20, 2022"""
|
"""eg. Oct 20, 2022"""
|
||||||
|
|
||||||
name: str
|
name: str
|
||||||
url: str
|
url: str
|
||||||
|
body: str
|
||||||
date: datetime
|
date: datetime
|
||||||
tags: List[str]
|
tags: List[str]
|
||||||
categories: List[str]
|
categories: List[str]
|
||||||
|
rrids: List[str]
|
||||||
|
blurbs: List[Blurb]
|
||||||
|
docs: Optional[str] = None
|
||||||
|
repo: Optional[str] = None
|
||||||
|
paper: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def parse_pattern(cls, classes:List[str], pattern:str) -> List[str]:
|
def parse_pattern(cls, classes:List[str], pattern:str) -> List[str]:
|
||||||
|
@ -50,16 +104,92 @@ class Project:
|
||||||
return [t[0] for t in tags if t is not None]
|
return [t[0] for t in tags if t is not None]
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_url(cls, url) -> 'Project':
|
def from_soup(cls, page:BeautifulSoup) -> 'Project':
|
||||||
page = BeautifulSoup(get_agent(url).content, 'lxml')
|
name = page.select(cls.selectors.name)[0].text
|
||||||
|
|
||||||
classes = page.select(cls.selectors.article)[0].get('class')
|
|
||||||
tags = cls.parse_pattern(classes, cls.patterns.tag)
|
|
||||||
categories = cls.parse_pattern(classes, cls.patterns.category)
|
|
||||||
date = datetime.strptime(
|
date = datetime.strptime(
|
||||||
page.select(cls.selectors.date)[0].text,
|
page.select(cls.selectors.date)[0].text,
|
||||||
cls.DATE_FMT
|
cls.DATE_FMT
|
||||||
)
|
)
|
||||||
|
url = page.find('link', rel="canonical").get('href')
|
||||||
|
|
||||||
|
# parse tag & category metadata
|
||||||
|
classes = page.select(cls.selectors.article)[0].get('class')
|
||||||
|
tags = cls.parse_pattern(classes, cls.patterns.tag)
|
||||||
|
categories = cls.parse_pattern(classes, cls.patterns.category)
|
||||||
|
|
||||||
|
# parse body text
|
||||||
body = page.select(cls.selectors.body)[0]
|
body = page.select(cls.selectors.body)[0]
|
||||||
# TODO Parse HTML to markdown
|
body_markdown = convert_text(body, to="commonmark-raw_html", format="html")
|
||||||
|
rrids = cls.patterns.rrid.findall(body_markdown)
|
||||||
|
|
||||||
|
# parse blurbs (links to documentation, etc.)
|
||||||
|
blurbs = [Blurb.from_soup(blurb) for blurb in page.select(Blurb.SELECTOR)]
|
||||||
|
|
||||||
|
return Project(
|
||||||
|
name = name,
|
||||||
|
date = date,
|
||||||
|
url = url,
|
||||||
|
body = body_markdown,
|
||||||
|
tags = tags,
|
||||||
|
categories = categories,
|
||||||
|
rrids = rrids,
|
||||||
|
blurbs = blurbs
|
||||||
|
)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_url(cls, url:str) -> 'Project':
    """
    Request a project page and parse the response.

    Args:
        url (str): URL of the project page

    Returns:
        Project: result of :meth:`.from_soup` on the fetched page
    """
    response = get_agent(url)
    soup = BeautifulSoup(response.content, 'lxml')
    return cls.from_soup(soup)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_file(cls, path:Union[Path, str]) -> 'Project':
    """
    Parse a project page from an HTML file on disk.

    Args:
        path (:class:`pathlib.Path`, str): Location of the HTML file

    Returns:
        Project: result of :meth:`.from_soup` on the file contents
    """
    # Read raw bytes and let BeautifulSoup work out the encoding, the same
    # way from_url hands it the raw response content. Text mode without an
    # explicit encoding would decode with the platform default instead.
    with open(path, 'rb') as file:
        html = file.read()
    page = BeautifulSoup(html, 'lxml')
    return cls.from_soup(page)
|
||||||
|
|
||||||
|
def dict(self) -> dict:
    """
    Shallow mapping of field name to value.

    ``ClassVar`` attributes such as ``DATE_FMT`` are not dataclass fields
    and are left out.
    """
    # dataclasses.fields() is the public way to list the real fields; the
    # previous check compared the repr of the private ``_field_type`` marker.
    # Imported locally because the module only imports ``dataclass``.
    from dataclasses import fields

    return {f.name: getattr(self, f.name) for f in fields(self)}
|
||||||
|
|
||||||
|
def __str__(self) -> str:
    """Pretty-printed fields of the project, leaving out the long ``body`` text."""
    summary = self.dict()
    summary.pop('body')
    return pformat(summary, indent=2)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_folder(path: Path=Path('./html'), verbose=True) -> List[Project]:
    """
    Parse a directory of downloaded HTML files into
    :class:`.Project` objects!

    Files that raise while being parsed are reported on stdout and skipped.

    Args:
        path (:class:`pathlib.Path`): Directory of downloaded html files
        verbose (bool): Print a status message and show a progress bar

    Returns:
        List[Project]: A list of project objects, in file name order
    """
    path = Path(path)
    # glob yields files in arbitrary order; sort so the result is reproducible
    html_files = sorted(path.glob('*.html'))
    if verbose:
        print('Parsing downloaded HTML')

    projects = []
    # Wrapping the iterable lets tqdm advance for every file, including the
    # ones that fail to parse, and close the bar once the loop is done.
    for f in tqdm(html_files, disable=not verbose):
        try:
            project = Project.from_file(f)
        except Exception as e:
            # deliberately best-effort: one bad page must not stop the rest
            print(f"\nException parsing {str(f)}, got {e}")
            continue

        projects.append(project)

    return projects
|
|
@ -1,12 +1,16 @@
|
||||||
"""
|
"""
|
||||||
Scraping and iteration code
|
Scraping and iteration code
|
||||||
"""
|
"""
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
from scrape_ob import OB_ROOT
|
from scrape_ob import OB_ROOT
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from typing import List
|
from typing import List, Union
|
||||||
|
|
||||||
from scrape_ob.parse import Category, Project
|
from scrape_ob.parse import Project
|
||||||
from scrape_ob.util import get_agent
|
from scrape_ob.util import get_agent, project_name
|
||||||
|
|
||||||
|
|
||||||
def get_root(root_url:str=OB_ROOT) -> BeautifulSoup:
|
def get_root(root_url:str=OB_ROOT) -> BeautifulSoup:
|
||||||
|
@ -23,4 +27,62 @@ def list_projects(category_url:str) -> List[str]:
|
||||||
category = BeautifulSoup(category.content, 'lxml')
|
category = BeautifulSoup(category.content, 'lxml')
|
||||||
|
|
||||||
projects = category.select(Project.selectors.index)
|
projects = category.select(Project.selectors.index)
|
||||||
return [p.get('href') for p in projects]
|
return [p.get('href') for p in projects]
|
||||||
|
|
||||||
|
def list_all_projects(root_url:str=OB_ROOT, dedupe:bool=True) -> List[str]:
    """
    Wraps the other scraping functions to return a list of all projects
    in all categories as a flat list

    Args:
        root_url: Root of the 'open source projects' page
        dedupe: Drop URLs that were already listed by an earlier category

    Returns:
        list[str]: List of project page URLs, in the order they were found
    """
    root = get_root(root_url)

    projects = []
    for category in list_category_urls(root):
        projects.extend(list_projects(category))

    if dedupe:
        # dict.fromkeys keeps the first occurrence of each URL in order;
        # going through a set would reorder the list differently on every run
        projects = list(dict.fromkeys(projects))
    return projects
|
||||||
|
|
||||||
|
|
||||||
|
def download_all(
        root_url:str=OB_ROOT,
        output_folder:Union[Path,str]=Path('./html'),
        verbose:bool=True
    ):
    """
    Download all OB pages, saving raw HTML to disk.

    Projects that already have an ``.html`` file in ``output_folder`` are
    not requested again.

    Args:
        root_url (str): Root of the open source tools page
        output_folder (:class:`pathlib.Path`): Directory to save HTML sources to
        verbose (bool): Print status messages and progress bars
    """
    output_folder = Path(output_folder)
    # parents=True so a nested output path can be created in one go
    output_folder.mkdir(parents=True, exist_ok=True)

    # list all project urls
    projects = list_all_projects(root_url)

    # avoid downloading files that we already have
    existing_files = {p.name for p in output_folder.glob("*.html")}
    filtered_projects = [p for p in projects if project_name(p) + '.html' not in existing_files]
    if verbose:
        print(f'\nTotal projects: {len(projects)}\nExisting files: {len(existing_files)}\nDownloading files: {len(filtered_projects)}')

    # Download everything to .html files!
    # Wrapping the iterable makes tqdm advance on every project and close
    # the bar when the loop ends.
    for project in tqdm(filtered_projects, disable=not verbose):
        page = get_agent(project)
        if not page.ok:
            # Do not cache an error page: the file would look like an
            # existing download and never be fetched again.
            print(f"\nSkipping {project}, got HTTP status {page.status_code}")
            continue

        output_file = output_folder / (project_name(project) + '.html')
        output_file.write_bytes(page.content)
|
|
@ -1,4 +1,5 @@
|
||||||
from random import choice
|
from random import choice
|
||||||
|
from urllib.parse import urlsplit
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
|
@ -17,4 +18,22 @@ def get_agent(url, **kwargs) -> requests.Response:
|
||||||
url,
|
url,
|
||||||
**kwargs
|
**kwargs
|
||||||
)
|
)
|
||||||
return gotten
|
return gotten
|
||||||
|
|
||||||
|
|
||||||
|
def project_name(url:str) -> str:
    """
    Extract the last path segment of an OB project URL.

    Examples:
        'https://edspace.american.edu/openbehavior/project/ethoscopes/'
        becomes
        'ethoscopes'

    Args:
        url: URL of project page

    Returns:
        str: the final component of the URL path, without slashes
    """
    trimmed_path = urlsplit(url).path.strip('/')
    # everything after the last remaining slash (the whole string if none)
    _, _, last_segment = trimmed_path.rpartition('/')
    return last_segment
|
Loading…
Reference in a new issue