initial and probably only

This commit is contained in:
sneakers-the-rat 2023-10-23 23:57:44 -07:00
parent fa0e5de3dd
commit 50df1cf7f2
3 changed files with 2420 additions and 0 deletions

2341
poetry.lock generated Normal file

File diff suppressed because it is too large Load diff

21
pyproject.toml Normal file
View file

@ -0,0 +1,21 @@
# Poetry project definition for the scrape-dandi package.
[tool.poetry]
name = "scrape-dandi"
version = "0.1.0"
description = "scraping dandi"
authors = ["sneakers-the-rat <JLSaunders987@gmail.com>"]
readme = "README.md"
# Ship the scrape_dandi package directory.
packages = [
{ include = "scrape_dandi" }
]
# Console entry point: the `scrape-dandi` command calls scrape_dandi.main().
[tool.poetry.scripts]
scrape-dandi = "scrape_dandi:main"
# NOTE(review): scrape_dandi also imports tqdm, which is not declared here;
# presumably it is installed transitively through dandi -- confirm.
[tool.poetry.dependencies]
python = "^3.11"
dandi = "^0.56.2"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

58
scrape_dandi/__init__.py Normal file
View file

@ -0,0 +1,58 @@
from pathlib import Path
from tqdm import trange
from datetime import datetime
from dandi.consts import ZARR_EXTENSIONS, metadata_all_fields
from dandi.dandiarchive import DandisetURL, _dandi_url_parser, parse_dandi_url
from dandi.cli.cmd_download import download
# Destination directory handed to the downloader through ``-o``.
# Alternative absolute target kept for reference: /mnt/seedbank/p2p/dandi/
OUT_DIR = "."

# File that receives one timestamped status line per dandiset.
LOG_TXT = "log.txt"

# Zero-padded dandiset numbers to leave out of the scrape.
SKIP_DANDISETS = ["000108"]  # humongous 372 human light sheet imaging

# Template that turns a dandiset number into its ``DANDI:NNNNNN`` identifier.
DANDI_ID = "DANDI:{:06d}"

# Bound on the dandiset numbers that main() scans.
MAX_DANDISET = 683
def check_nwb(dandiset: int) -> bool:
    """Decide whether a dandiset should be downloaded.

    A dandiset qualifies when it is not listed in ``SKIP_DANDISETS``, the
    version that ``navigate`` resolves for it is not ``draft``, and at
    least one of its asset paths ends in ``.nwb``.

    Args:
        dandiset: Numeric dandiset identifier, e.g. ``108`` for
            ``DANDI:000108``.

    Returns:
        ``True`` when the dandiset qualifies; ``False`` otherwise,
        including when the dandiset cannot be looked up at all.
    """
    dandiset_id = DANDI_ID.format(dandiset)
    # SKIP_DANDISETS stores only the zero-padded numeric part of the id.
    if dandiset_id.split(':')[-1] in SKIP_DANDISETS:
        return False
    try:
        url = parse_dandi_url(dandiset_id)
        with url.navigate(strict=True) as (_client, remote, assets):
            # Test the version first so draft dandisets are rejected
            # without having to walk their asset listing.
            if remote.version.identifier == 'draft':
                return False
            # Generator form stops at the first NWB asset found.
            return any(asset.path.endswith('.nwb') for asset in assets)
    except Exception:
        # Best effort: any lookup failure (missing dandiset, network
        # error, ...) means "skip". Unlike a bare ``except`` this still
        # lets KeyboardInterrupt and SystemExit propagate.
        return False
def _log_result(dandiset: int, status: str) -> None:
    """Append one timestamped status line for a dandiset to ``LOG_TXT``."""
    with open(LOG_TXT, 'a') as lfile:
        lfile.write(f"{datetime.now().isoformat()} - {dandiset:03d} - {status}\n")


def main():
    """Scan dandisets 1..MAX_DANDISET and download those passing check_nwb.

    Every dandiset gets one line in ``LOG_TXT``: ``SKIP`` when
    ``check_nwb`` rejects it, ``GET`` once its download call has returned.
    Errors raised by the download are not caught and stop the scan.
    """
    # Start at 1 and include MAX_DANDISET itself; a bare trange(MAX_DANDISET)
    # would scan 0..MAX_DANDISET-1 instead.
    for i in trange(1, MAX_DANDISET + 1):
        if not check_nwb(i):
            _log_result(i, 'SKIP')
            continue
        download(
            [
                DANDI_ID.format(i),
                '-o', str(OUT_DIR),
                '--existing', 'refresh',
                '--jobs', '24',
            ],
            # ``download`` is a Click command: in the default standalone
            # mode it ends with sys.exit(), which would terminate the scan
            # after the first dandiset. Disable that so the call returns.
            standalone_mode=False,
        )
        _log_result(i, 'GET')