openneuro-wiki/openneuro_wiki/scratch.py

58 lines
No EOL
1.5 KiB
Python

import pdb
import re
import frontmatter
import marko
from marko.block import Heading
from openneuro_wiki.parse import ON_Repo
import pandas as pd
def parse_all_files(files):
    """Read each file and split it into front matter and parsed Markdown.

    Args:
        files: Iterable of paths accepted by ``open``.

    Returns:
        A list with one dict per input file, in input order. Each dict has
        the keys ``'metadata'`` (``.metadata`` of the object returned by
        ``frontmatter.loads``) and ``'content'`` (the result of
        ``marko.parse`` applied to that object's ``.content``).
    """
    out = []
    for file in files:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's locale-default encoding.
        with open(file, 'r', encoding='utf-8') as ofile:
            text = ofile.read()
        fm = frontmatter.loads(text)
        md = marko.parse(fm.content)
        out.append({'metadata': fm.metadata, 'content': md})
    return out
def count_fields(parsed):
    """Count, per metadata field, how many entries have a non-missing value.

    Args:
        parsed: Sequence of dicts, each carrying a ``'metadata'`` mapping.

    Returns:
        A pandas Series indexed by metadata field name whose values are the
        number of entries where that field is not NA.
    """
    records = []
    for entry in parsed:
        records.append(entry['metadata'])
    return pd.DataFrame(records).notna().sum()
def unpack_children(block):
    """Return the text carried by ``block`` and its descendants.

    The tree is walked through the ``children`` attribute:

    * an object without ``children`` is returned unchanged;
    * when ``children`` is a string, that string is returned;
    * otherwise every child is unpacked recursively.

    Args:
        block: Any object; nodes expose their content via ``children``.

    Returns:
        ``None`` when ``children`` is an empty container, the single child's
        unpacked value when there is exactly one child, and otherwise the
        concatenation of the text of all children (non-string leaves are
        skipped when joining).
    """
    if not hasattr(block, 'children'):
        return block

    children = block.children
    if isinstance(children, str):
        return children

    # Unpack every child; returning from inside the loop would silently drop
    # the text of every sibling after the first one.
    pieces = [unpack_children(child) for child in children]
    if not pieces:
        return None
    if len(pieces) == 1:
        # Single child: hand its value back untouched, whatever its type.
        return pieces[0]
    return ''.join(piece for piece in pieces if isinstance(piece, str))
def unique_headers(parsed):
    """Collect the text of every top-level heading across parsed documents.

    Args:
        parsed: Sequence of dicts whose ``'content'`` value exposes a
            ``children`` iterable of blocks.

    Returns:
        A pandas DataFrame with one row per ``Heading`` block found directly
        under a document, holding the value of ``unpack_children`` for that
        block in the ``'header'`` column. No de-duplication is performed
        here.
    """
    def heading_rows():
        # Only direct children of each document are inspected.
        for entry in parsed:
            for node in entry['content'].children:
                if isinstance(node, Heading):
                    yield {'header': unpack_children(node)}

    return pd.DataFrame(list(heading_rows()))
def get_md_type(doc:marko.block.Document, get:str):
    """Find every descendant of ``doc`` whose class name equals ``get``.

    The tree is walked depth-first through the ``children`` attribute; a
    matching node is listed before any matches found beneath it.

    Args:
        doc: Root node to search. The root itself is never tested.
        get: Class name to compare against ``type(node).__name__``.

    Returns:
        A list of matching nodes in pre-order; empty when ``doc`` has no
        ``children`` attribute.
    """
    if not hasattr(doc, 'children'):
        return []

    found = []
    for node in doc.children:
        if type(node).__name__ == get:
            found.append(node)
        # Nodes without ``children`` yield an empty list from the recursive
        # call, so no separate check is needed before descending.
        found += get_md_type(node, get)
    return found
# Matches the remainder of the line that immediately follows a line ending in
# "Project Author(s)".
pattern = re.compile(r"(?<=Project Author\(s\)\n).*", re.MULTILINE)


def find_project_authors(text):
    """Return the line(s) that directly follow a "Project Author(s)" line.

    This replaces a bare module-level ``pattern.findall(md)`` call: ``md`` is
    not defined at module scope, so importing the module raised ``NameError``.

    Args:
        text: Raw document text to search.

    Returns:
        A list of strings, one per occurrence, each holding the text of the
        line after the "Project Author(s)" line (possibly empty).
    """
    return pattern.findall(text)