"""
Models and parsers for OB pages
"""
|
|
import re
from dataclasses import dataclass, fields
from datetime import datetime
from pathlib import Path
from pprint import pformat
from typing import TYPE_CHECKING, ClassVar, List, Literal, Optional, TypedDict, Union

from bs4 import BeautifulSoup
from bs4.element import Tag
from parse import parse as _parse
from pypandoc import convert_text
from tqdm import tqdm

from scrape_ob.util import get_agent
|
|
|
|
|
|
# Allowed values for Blurb.type -- the kind of resource a blurb links to
BLURB_TYPES = Literal['docs', 'paper', 'repo', 'homepage']
|
|
|
|
class Category:
    """Namespace holding the CSS selector for category links on OB pages."""

    # anchor tags inside a module header element
    selector = ".et_pb_module_header a"
|
|
|
|
@dataclass
class Blurb:
    """
    One "blurb" box on an OB project page: a header, a description
    element, and the set of links it contains.

    Constructed from HTML with :meth:`from_soup`.
    """

    # CSS selector for a blurb container element
    SELECTOR: ClassVar[str] = '.et_pb_blurb_content'
    # tag name of the blurb's header element
    HEADER: ClassVar[str] = 'h4'
    # CSS selector for the description element inside a blurb
    DESCRIPTION: ClassVar[str] = '.et_pb_blurb_description'

    name: str
    body: "Tag"
    description: "Tag"
    links: List[str]
    # what kind of resource this blurb points at, when known
    type: "Optional[BLURB_TYPES]" = None

    @classmethod
    def from_soup(cls, blurb: "Tag") -> 'Blurb':
        """
        Construct a :class:`Blurb` from the bs4 Tag element that contains it.

        Args:
            blurb: the ``.et_pb_blurb_content`` container element.

        Returns:
            Blurb: links are deduplicated and sorted so output is
            deterministic across runs (a plain ``set`` iterates in
            hash-randomized order).
        """
        name = blurb.find(cls.HEADER).text
        description = blurb.select_one(cls.DESCRIPTION)
        # `href` can be absent (None); sort None first via the key
        links = sorted(
            {a.get('href') for a in blurb.find_all('a')},
            key=lambda href: href or '',
        )
        # use cls() rather than Blurb() so subclasses construct themselves
        return cls(
            body=blurb,
            name=name,
            description=description,
            links=links,
        )

    def dict(self) -> dict:
        """Return instance fields as a plain dict (ClassVars excluded)."""
        # dataclasses.fields() already omits ClassVar pseudo-fields
        return {f.name: getattr(self, f.name) for f in fields(self)}

    def __str__(self) -> str:
        # omit the bs4 elements -- their string forms are huge and unreadable
        adict = self.dict()
        del adict['body']
        del adict['description']
        return pformat(adict, indent=2)

    def __repr__(self) -> str:
        return self.__str__()
|
|
|
|
|
|
|
|
@dataclass
class Project:
    """
    A single OB project page parsed into structured metadata:
    name, date, body markdown, tags/categories, RRIDs, and blurbs.
    """

    class selectors:
        # CSS selectors for the pieces of a project page
        index = ".et_pb_portfolio_item h2 a"
        article = "article"
        date = "span.published"
        body = ".et_pb_text_inner"
        name = '.entry-title'

    class patterns:
        # parse()-style patterns applied to the <article> class list,
        # plus a regex for RRIDs in the body text
        category = "project_category-{}"
        tag = "project_tag-{}"
        rrid = re.compile(r"RRID:\s{0,2}([\w\d_]*)")

    # strptime format of the published date, eg. "Oct 20, 2022"
    DATE_FMT: ClassVar[str] = "%b %d, %Y"

    name: str
    url: str
    body: str
    date: datetime
    tags: List[str]
    categories: List[str]
    rrids: List[str]
    blurbs: "List[Blurb]"
    docs: Optional[str] = None
    repo: Optional[str] = None
    paper: Optional[str] = None

    @classmethod
    def parse_pattern(cls, classes: List[str], pattern: str) -> List[str]:
        """Extract the ``{}`` capture from every class name matching ``pattern``."""
        matches = [_parse(pattern, c) for c in classes]
        return [m[0] for m in matches if m is not None]

    @classmethod
    def from_soup(cls, page: "BeautifulSoup") -> 'Project':
        """
        Parse a full project page into a :class:`Project`.

        Args:
            page: the parsed HTML of one project page.

        Returns:
            Project: ``docs``/``repo``/``paper`` are left as ``None`` here.
        """
        name = page.select_one(cls.selectors.name).text
        date = datetime.strptime(
            page.select_one(cls.selectors.date).text,
            cls.DATE_FMT,
        )
        url = page.find('link', rel="canonical").get('href')

        # tag & category metadata is encoded in the <article> class list
        classes = page.select_one(cls.selectors.article).get('class')
        tags = cls.parse_pattern(classes, cls.patterns.tag)
        categories = cls.parse_pattern(classes, cls.patterns.category)

        # body text: html -> markdown, then scan the markdown for RRIDs
        body = page.select_one(cls.selectors.body)
        body_markdown = convert_text(body, to="commonmark-raw_html", format="html")
        rrids = cls.patterns.rrid.findall(body_markdown)

        # blurbs (links to documentation, etc.)
        blurbs = [Blurb.from_soup(blurb) for blurb in page.select(Blurb.SELECTOR)]

        # use cls() rather than Project() so subclasses construct themselves
        return cls(
            name=name,
            date=date,
            url=url,
            body=body_markdown,
            tags=tags,
            categories=categories,
            rrids=rrids,
            blurbs=blurbs,
        )

    @classmethod
    def from_url(cls, url: str) -> 'Project':
        """Fetch a project page over HTTP and parse it."""
        page = BeautifulSoup(get_agent(url).content, 'lxml')
        return cls.from_soup(page)

    @classmethod
    def from_file(cls, path: Union[Path, str]) -> 'Project':
        """Parse a previously downloaded project page from disk."""
        with open(path, 'r') as file:
            html = file.read()
        page = BeautifulSoup(html, 'lxml')
        return cls.from_soup(page)

    def dict(self) -> dict:
        """Return instance fields as a plain dict (ClassVars excluded)."""
        # dataclasses.fields() already omits ClassVar pseudo-fields
        return {f.name: getattr(self, f.name) for f in fields(self)}

    def __str__(self) -> str:
        # omit the (long) markdown body from the printed form
        adict = self.dict()
        del adict['body']
        return pformat(adict, indent=2)
|
|
|
|
|
|
def parse_folder(path: Union[Path, str] = Path('./html'), verbose: bool = True) -> "List[Project]":
    """
    Parse a directory of downloaded HTML files into
    :class:`.Project` objects!

    Files that fail to parse are reported and skipped (best-effort).

    Args:
        path (:class:`pathlib.Path`): Directory of downloaded html files
        verbose (bool): Print a message and show a progress bar

    Returns:
        List[Project]: A list of project objects!
    """
    path = Path(path)
    html_files = list(path.glob('*.html'))

    pbar = None
    if verbose:
        print('Parsing downloaded HTML')
        pbar = tqdm(total=len(html_files))

    projects = []
    try:
        for f in html_files:
            try:
                projects.append(Project.from_file(f))
            except Exception as e:
                # best-effort: report the bad file and keep going
                print(f"\nException parsing {str(f)}, got {e}")
            finally:
                # count the file whether or not it parsed,
                # so the bar reaches its total even on failures
                if pbar is not None:
                    pbar.update()
    finally:
        # release tqdm's terminal state
        if pbar is not None:
            pbar.close()

    return projects