import re

import frontmatter
import marko
import pandas as pd
from marko.block import Heading

from openneuro_wiki.parse import ON_Repo


def parse_all_files(files):
    """Parse markdown files into frontmatter metadata and marko document trees."""
    out = []
    for file in files:
        with open(file, 'r') as ofile:
            text = ofile.read()
        fm = frontmatter.loads(text)
        md = marko.parse(fm.content)
        out.append({'metadata': fm.metadata, 'content': md})
    return out


def count_fields(parsed):
    """Count how many documents define each frontmatter field."""
    df = pd.DataFrame([p['metadata'] for p in parsed])
    return df.notna().sum()


def unpack_children(block):
    """Recursively collect the raw text contained in a marko block."""
    if hasattr(block, 'children'):
        if isinstance(block.children, str):
            return block.children
        parts = []
        for child in block.children:
            text = unpack_children(child)
            if isinstance(text, str):
                parts.append(text)
        return ''.join(parts)
    return block


def unique_headers(parsed):
    """Collect the text of every top-level heading across the parsed documents."""
    headers = []
    for fulldoc in parsed:
        doc = fulldoc['content']
        for block in doc.children:
            if isinstance(block, Heading):
                headers.append({'header': unpack_children(block)})
    return pd.DataFrame(headers)


def get_md_type(doc: marko.block.Document, get: str):
    """Recursively collect every element whose type name matches ``get`` (e.g. 'FencedCode')."""
    matches = []
    if hasattr(doc, 'children'):
        for child in doc.children:
            if type(child).__name__ == get:
                matches.append(child)
            if hasattr(child, 'children') and not isinstance(child.children, str):
                matches.extend(get_md_type(child, get))
    return matches


# Matches the line(s) immediately following a "Project Author(s)" heading.
# The pattern is meant to run against the raw markdown text, not a parsed
# marko document.
pattern = re.compile(r"(?<=Project Author\(s\)\n).*", re.MULTILINE)


def find_project_authors(text: str):
    """Return the lines that directly follow 'Project Author(s)' in raw markdown text."""
    return pattern.findall(text)
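

# Minimal usage sketch. Assumptions not taken from the module itself: the wiki
# pages are local *.md files matched by the 'wiki/*.md' glob below; adjust the
# path to wherever the repository is checked out.
if __name__ == "__main__":
    from glob import glob

    files = sorted(glob('wiki/*.md'))  # hypothetical location of the wiki markdown files
    parsed = parse_all_files(files)

    # How many pages define each frontmatter field.
    print(count_fields(parsed))

    # Most common section headings across all pages.
    if not unique_headers(parsed).empty:
        print(unique_headers(parsed)['header'].value_counts())

    # Example of get_md_type: pull every fenced code block out of the first page.
    if parsed:
        print(get_md_type(parsed[0]['content'], 'FencedCode'))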