58 lines
1.5 KiB
Python
58 lines
1.5 KiB
Python
|
import pdb
|
||
|
import re
|
||
|
|
||
|
import frontmatter
|
||
|
import marko
|
||
|
from marko.block import Heading
|
||
|
|
||
|
from openneuro_wiki.parse import ON_Repo
|
||
|
import pandas as pd
|
||
|
|
||
|
def parse_all_files(files):
    """Parse a collection of markdown files that carry YAML frontmatter.

    Parameters
    ----------
    files : iterable of path-like
        Paths to markdown files, each with an optional frontmatter header.

    Returns
    -------
    list of dict
        One dict per file with keys:
        - 'metadata': the frontmatter mapping (empty dict if none present)
        - 'content': the marko-parsed document tree of the body
    """
    out = []
    for path in files:
        # Explicit encoding: the previous locale-default open() could
        # mis-decode UTF-8 wiki pages on non-UTF-8 platforms.
        with open(path, 'r', encoding='utf-8') as handle:
            text = handle.read()
        fm = frontmatter.loads(text)
        out.append({'metadata': fm.metadata, 'content': marko.parse(fm.content)})
    return out
|
||
|
|
||
|
def count_fields(parsed):
    """Tally how many parsed documents supply each metadata field.

    Builds one DataFrame row per document's frontmatter metadata and
    returns, for each field (column), the count of non-missing values.
    """
    metadata_rows = [entry['metadata'] for entry in parsed]
    table = pd.DataFrame(metadata_rows)
    return table.notna().sum()
|
||
|
|
||
|
def unpack_children(block):
    """Descend through first children until a text payload is reached.

    Leaf nodes in this tree store their text directly in ``.children``
    as a str, which is returned. Any object without a ``children``
    attribute is returned as-is. Only the FIRST child at each level is
    examined, and a node whose child list is empty yields None.
    """
    if not hasattr(block, 'children'):
        return block
    payload = block.children
    if isinstance(payload, str):
        return payload
    # Follow only the first child; an empty child list bottoms out as None.
    return unpack_children(next(iter(payload), None))
|
||
|
|
||
|
def unique_headers(parsed):
    """Gather the text of every Heading block across parsed documents.

    NOTE: despite the name, no de-duplication is performed here — the
    result has one row per heading encountered, in document order.

    Returns a one-column DataFrame with key 'header'.
    """
    rows = []
    for entry in parsed:
        document = entry['content']
        rows.extend(
            {'header': unpack_children(node)}
            for node in document.children
            if isinstance(node, Heading)
        )
    return pd.DataFrame(rows)
|
||
|
|
||
|
def get_md_type(doc: "marko.block.Document", get: str) -> list:
    """Recursively collect every node whose class name equals *get*.

    Parameters
    ----------
    doc : marko element (any object with a ``children`` list works)
        Root of the subtree to search. The root itself is never matched,
        only its descendants.
    get : str
        Class name to match, e.g. "Heading" or "Paragraph".

    Returns
    -------
    list
        Matching nodes in document order.
    """
    matches = []
    if hasattr(doc, 'children'):
        children = doc.children
        # Leaf nodes store their text as a str in ``children``; iterating
        # a str would walk individual characters, so stop at text payloads.
        if isinstance(children, str):
            return matches
        for child in children:
            if type(child).__name__ == get:
                matches.append(child)
            if hasattr(child, 'children'):
                matches.extend(get_md_type(child, get))
    return matches
|
||
|
|
||
|
|
||
|
# Matches the text on the line that immediately follows a literal
# "Project Author(s)" line (the fixed-width lookbehind does the anchoring).
pattern = re.compile(r"(?<=Project Author\(s\)\n).*", re.MULTILINE)


def find_project_authors(text):
    """Return every line that directly follows a "Project Author(s)" line.

    Replaces a stray module-level ``pattern.findall(md)`` call: ``md`` was
    a local of ``parse_all_files`` and undefined at module scope, so the
    bare call raised NameError the moment this module was imported.
    """
    return pattern.findall(text)
|