# scrape_ob/scrape_ob/parse.py
"""
Models and parsers for OB pages
"""
import re
from typing import ClassVar, List, TypedDict, Optional, Literal, TYPE_CHECKING, Union
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from pprint import pformat
from bs4 import BeautifulSoup
from bs4.element import Tag
from parse import parse as _parse
from pypandoc import convert_text
from tqdm import tqdm
from scrape_ob.util import get_agent
BLURB_TYPES = Literal['docs', 'paper', 'repo', 'homepage']


class Category:
    selector = ".et_pb_module_header a"


@dataclass
class Blurb:
    SELECTOR: ClassVar[str] = '.et_pb_blurb_content'
    HEADER: ClassVar[str] = 'h4'
    DESCRIPTION: ClassVar[str] = '.et_pb_blurb_description'
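    # NB (annotation): these ClassVar selectors mirror the Divi "blurb" module
    # markup on project pages; they encode assumptions about the upstream HTML,
    # not a stable interface.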

    name: str
    body: Tag
    description: Tag
    links: List[str]
    type: Optional[BLURB_TYPES] = None
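    # NB (annotation): ``type`` is never inferred by :meth:`from_soup`; it stays
    # ``None`` unless set by the caller (see ``BLURB_TYPES`` for valid values).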

    @classmethod
    def from_soup(cls, blurb: Tag) -> 'Blurb':
        """
        Construct a :class:`Blurb` from the bs4 Tag element that contains the blurb.
        """
        body = blurb
        name = blurb.find(cls.HEADER).text
        description = blurb.select(cls.DESCRIPTION)[0]
        # deduplicate hrefs; note that ``set`` does not preserve link order
        links = list(set(a.get('href') for a in blurb.find_all('a')))
        return Blurb(
            body=body,
            name=name,
            description=description,
            links=links,
        )
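
    # Usage sketch (``html`` is a hypothetical page source; selectors assume the
    # OB markup described above):
    #
    #     soup = BeautifulSoup(html, 'lxml')
    #     blurbs = [Blurb.from_soup(tag) for tag in soup.select(Blurb.SELECTOR)]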

    def dict(self) -> dict:
        """Return instance fields as a dict, excluding ClassVar selectors."""
        return {
            k: getattr(self, k) for k, field in self.__dataclass_fields__.items()
            if str(field._field_type) != "_FIELD_CLASSVAR"
        }

    def __str__(self) -> str:
        adict = self.dict()
        del adict['body']
        del adict['description']
        return pformat(adict, indent=2)

    def __repr__(self) -> str:
        return self.__str__()


@dataclass
class Project:
    class selectors:
        index = ".et_pb_portfolio_item h2 a"
        article = "article"
        date = "span.published"
        body = ".et_pb_text_inner"
        name = '.entry-title'

    class patterns:
        category = "project_category-{}"
        tag = "project_tag-{}"
        rrid = re.compile(r"RRID:\s{0,2}([\w\d_]*)")
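        # NB (annotation): matches Research Resource Identifiers of the form
        # "RRID:SCR_012345", capturing the bare identifier ("SCR_012345").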

    DATE_FMT: ClassVar[str] = "%b %d, %Y"
    """e.g. Oct 20, 2022"""

    name: str
    url: str
    body: str
    date: datetime
    tags: List[str]
    categories: List[str]
    rrids: List[str]
    blurbs: List[Blurb]
    docs: Optional[str] = None
    repo: Optional[str] = None
    paper: Optional[str] = None

    @classmethod
    def parse_pattern(cls, classes: List[str], pattern: str) -> List[str]:
        """
        Extract the first captured field from each class string matching
        ``pattern``, discarding strings that do not match.
        """
        tags = [_parse(pattern, c) for c in classes]
        return [t[0] for t in tags if t is not None]
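
    # Example (illustrative class list):
    #
    #     Project.parse_pattern(['post', 'project_tag-ephys'], Project.patterns.tag)
    #     # -> ['ephys']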

    @classmethod
    def from_soup(cls, page: BeautifulSoup) -> 'Project':
        """
        Parse a full project page into a :class:`Project`.
        """
        name = page.select(cls.selectors.name)[0].text
        date = datetime.strptime(
            page.select(cls.selectors.date)[0].text,
            cls.DATE_FMT
        )
        url = page.find('link', rel="canonical").get('href')

        # parse tag & category metadata from the article element's CSS classes
        classes = page.select(cls.selectors.article)[0].get('class')
        tags = cls.parse_pattern(classes, cls.patterns.tag)
        categories = cls.parse_pattern(classes, cls.patterns.category)

        # parse body text: convert the HTML body to commonmark, then scan the
        # plain text for RRIDs
        body = page.select(cls.selectors.body)[0]
        body_markdown = convert_text(body, to="commonmark-raw_html", format="html")
        rrids = cls.patterns.rrid.findall(body_markdown)

        # parse blurbs (links to documentation, etc.)
        blurbs = [Blurb.from_soup(blurb) for blurb in page.select(Blurb.SELECTOR)]

        return Project(
            name=name,
            date=date,
            url=url,
            body=body_markdown,
            tags=tags,
            categories=categories,
            rrids=rrids,
            blurbs=blurbs,
        )

    @classmethod
    def from_url(cls, url: str) -> 'Project':
        page = BeautifulSoup(get_agent(url).content, 'lxml')
        return cls.from_soup(page)
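
    # Usage sketch (URL is hypothetical):
    #
    #     project = Project.from_url('https://example.com/project/some-project/')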

    @classmethod
    def from_file(cls, path: Union[Path, str]) -> 'Project':
        with open(path, 'r') as file:
            html = file.read()
        page = BeautifulSoup(html, 'lxml')
        return cls.from_soup(page)

    def dict(self) -> dict:
        """Return instance fields as a dict, excluding ClassVar attributes."""
        return {
            k: getattr(self, k) for k, field in self.__dataclass_fields__.items()
            if str(field._field_type) != "_FIELD_CLASSVAR"
        }

    def __str__(self) -> str:
        adict = self.dict()
        del adict['body']
        return pformat(adict, indent=2)


def parse_folder(path: Union[Path, str] = Path('./html'), verbose: bool = True) -> List[Project]:
    """
    Parse a directory of downloaded HTML files into
    :class:`.Project` objects!

    Args:
        path (:class:`pathlib.Path`): Directory of downloaded html files
        verbose (bool): If ``True``, print status and show a progress bar

    Returns:
        List[Project]: A list of project objects!
    """
    path = Path(path)
    html_files = list(path.glob('*.html'))

    if verbose:
        print('Parsing downloaded HTML')
        pbar = tqdm(total=len(html_files))
    else:
        pbar = None

    projects = []
    for f in html_files:
        try:
            project = Project.from_file(f)
        except Exception as e:
            print(f"\nException parsing {str(f)}, got {e}")
            continue
        projects.append(project)
        if verbose:
            pbar.update()

    if pbar is not None:
        pbar.close()

    return projects
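

# Usage sketch (assumes pages were downloaded beforehand into './html', the
# module's default; the download step is expected to be handled elsewhere,
# e.g. by the package's scraping routines):
#
#     from scrape_ob.parse import parse_folder
#
#     projects = parse_folder('./html')
#     for project in projects:
#         print(project)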