58 lines
No EOL
1.5 KiB
Python
58 lines
No EOL
1.5 KiB
Python
import pdb
|
|
import re
|
|
|
|
import frontmatter
|
|
import marko
|
|
from marko.block import Heading
|
|
|
|
from openneuro_wiki.parse import ON_Repo
|
|
import pandas as pd
|
|
|
|
def parse_all_files(files):
    """Parse each file into its front-matter metadata and a marko document.

    Args:
        files: Iterable of paths (anything ``open`` accepts).

    Returns:
        A list with one dict per input file, in input order. Each dict has
        the keys ``'metadata'`` (the ``metadata`` of the post returned by
        ``frontmatter.loads``) and ``'content'`` (the result of
        ``marko.parse`` applied to the post's ``content``).
    """
    out = []
    for file in files:
        # Decode explicitly instead of relying on the platform's locale
        # default. NOTE(review): assumes the input files are UTF-8 - confirm.
        with open(file, 'r', encoding='utf-8') as ofile:
            text = ofile.read()
        # Parsing happens after the file handle has been released.
        fm = frontmatter.loads(text)
        md = marko.parse(fm.content)
        out.append({'metadata': fm.metadata, 'content': md})
    return out
|
|
|
|
def count_fields(parsed):
    """Count, for every front-matter field, how many entries have a value.

    Args:
        parsed: Iterable of dicts that each carry a ``'metadata'`` mapping
            (the shape built by ``parse_all_files``).

    Returns:
        A pandas Series indexed by metadata field name whose values are the
        number of entries in which that field is not NA.
    """
    metadata_rows = []
    for entry in parsed:
        metadata_rows.append(entry['metadata'])

    # One row per entry, one column per field seen in any entry; fields an
    # entry lacks become NA and are therefore not counted.
    table = pd.DataFrame(metadata_rows)
    has_value = table.notna()
    return has_value.sum()
|
|
|
|
def unpack_children(block):
    """Return the text held by *block*, descending through nested children.

    Args:
        block: Any object. Objects exposing a ``children`` attribute are
            unpacked; anything else is treated as a leaf.

    Returns:
        * ``block`` itself when it has no ``children`` attribute.
        * ``block.children`` when that attribute is a ``str``.
        * ``None`` when ``children`` is an empty sequence.
        * The unpacked value of the only child when there is exactly one.
        * Otherwise the concatenation, in order, of the text unpacked from
          every child. Children that do not unpack to a ``str`` are skipped.
    """
    if not hasattr(block, 'children'):
        return block

    children = block.children
    if isinstance(children, str):
        return children

    # Visit every child: returning from inside the loop would stop at the
    # first one and drop the rest of the text (e.g. a heading that mixes
    # plain and emphasised text has several inline children).
    unpacked = [unpack_children(child) for child in children]
    if not unpacked:
        return None
    if len(unpacked) == 1:
        # Single child: hand its value back untouched, whatever its type.
        return unpacked[0]
    return ''.join(part for part in unpacked if isinstance(part, str))
|
|
|
|
def unique_headers(parsed):
    """Collect the text of every top-level heading in the parsed documents.

    Only the direct children of each document are inspected; headings nested
    inside other blocks are not visited.

    NOTE(review): despite the name, repeated headings are not de-duplicated;
    every heading found produces one row. Confirm whether callers expect
    unique values.

    Args:
        parsed: Iterable of dicts whose ``'content'`` value exposes a
            ``children`` sequence (the shape built by ``parse_all_files``).

    Returns:
        A pandas DataFrame with a single ``'header'`` column holding, for
        each ``Heading`` block, the value of ``unpack_children(block)``.
        The column is present even when no heading was found.
    """
    headers = []
    for fulldoc in parsed:
        doc = fulldoc['content']
        for block in doc.children:
            if isinstance(block, Heading):
                headers.append({'header': unpack_children(block)})

    # Name the column explicitly: built from an empty list, the frame would
    # otherwise have no 'header' column and df['header'] would raise KeyError.
    return pd.DataFrame(headers, columns=['header'])
|
|
|
|
def get_md_type(doc: "marko.block.Document", get: str):
    """Collect every descendant of *doc* whose class name equals *get*.

    The tree is walked depth-first: a matching element is recorded before
    its own descendants are searched, and siblings are visited in order.

    Args:
        doc: The element to search. The function calls itself on child
            elements, so any object is accepted; objects without a
            ``children`` attribute simply yield no matches. The annotation
            is written as a string so it is not evaluated when the function
            is defined.
        get: Class name to look for, compared against
            ``type(child).__name__``.

    Returns:
        A list of the matching elements (empty when there are none).
    """
    children = getattr(doc, 'children', None)
    # `children` may be a plain str (unpack_children handles the same case).
    # Iterating it would walk the text character by character, so treat it,
    # like a missing attribute, as "nothing to descend into".
    if children is None or isinstance(children, str):
        return []

    matches = []
    for child in children:
        if type(child).__name__ == get:
            matches.append(child)
        matches.extend(get_md_type(child, get))
    return matches
|
|
|
|
|
|
# Captures the rest of the line that immediately follows a line ending in
# "Project Author(s)". re.MULTILINE only changes how ^ and $ match, so it has
# no effect on this pattern; the flag is kept unchanged.
pattern = re.compile(r"(?<=Project Author\(s\)\n).*", re.MULTILINE)


def find_project_authors(text):
    """Return the text following each "Project Author(s)" line in *text*.

    This replaces a bare module-level ``pattern.findall(md)`` call: ``md`` is
    only a local variable inside ``parse_all_files``, so that statement
    raised ``NameError`` on import, and its result was discarded.

    Args:
        text: The string to search.

    Returns:
        A list with one entry per occurrence of ``"Project Author(s)"``
        followed by a newline; each entry is the text from that newline up
        to the next line break (possibly empty). Empty list if none occur.
    """
    return pattern.findall(text)