initial and probably only
This commit is contained in:
parent
fa0e5de3dd
commit
50df1cf7f2
3 changed files with 2420 additions and 0 deletions
2341
poetry.lock
generated
Normal file
2341
poetry.lock
generated
Normal file
File diff suppressed because it is too large
Load diff
21
pyproject.toml
Normal file
21
pyproject.toml
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
[tool.poetry]
name = "scrape-dandi"
version = "0.1.0"
description = "scraping dandi"
authors = ["sneakers-the-rat <JLSaunders987@gmail.com>"]
readme = "README.md"
packages = [
    { include = "scrape_dandi" }
]

[tool.poetry.scripts]
# Console entry point: the `scrape-dandi` command calls scrape_dandi.main()
scrape-dandi = "scrape_dandi:main"

[tool.poetry.dependencies]
python = "^3.11"
dandi = "^0.56.2"
# scrape_dandi imports tqdm directly (`from tqdm import trange`), so it is
# declared here instead of relying on it being pulled in transitively.
# Regenerate poetry.lock (`poetry lock`) after adding this line.
tqdm = "^4"


[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
|
58
scrape_dandi/__init__.py
Normal file
58
scrape_dandi/__init__.py
Normal file
|
@ -0,0 +1,58 @@
|
||||||
|
from pathlib import Path
|
||||||
|
from tqdm import trange
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
from dandi.consts import ZARR_EXTENSIONS, metadata_all_fields
|
||||||
|
from dandi.dandiarchive import DandisetURL, _dandi_url_parser, parse_dandi_url
|
||||||
|
from dandi.cli.cmd_download import download
|
||||||
|
|
||||||
|
|
||||||
|
# Directory handed to the downloader's ``-o`` option.
# Alternative absolute target, kept for reference:
#   OUT_DIR = Path('/mnt/seedbank/p2p/dandi/')
OUT_DIR = '.'

# Text file that receives one appended status line per dandiset processed.
LOG_TXT = 'log.txt'

# Template that turns a dandiset number into its identifier,
# e.g. 108 -> 'DANDI:000108'.
DANDI_ID = 'DANDI:{:06d}'

# Upper bound for the dandiset numbers that main() walks through.
MAX_DANDISET = 683

# Dandisets to leave out, written as zero-padded six-digit strings.
SKIP_DANDISETS = [
    '000108',  # humongous 372 human light sheet imaging
]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def check_nwb(dandiset: int) -> bool:
    """Decide whether a dandiset should be downloaded.

    A dandiset qualifies when it is not listed in ``SKIP_DANDISETS``, the
    version it resolves to is not ``'draft'``, and at least one of its
    assets has a path ending in ``.nwb``.

    Args:
        dandiset: Numeric dandiset identifier, e.g. ``108`` for
            ``DANDI:000108``.

    Returns:
        ``True`` if the dandiset should be downloaded. ``False`` if it is
        on the skip list, resolves to a draft, has no ``.nwb`` asset, or
        the lookup raised an error.
    """
    # SKIP_DANDISETS stores zero-padded strings, so compare in that form.
    if f"{dandiset:06d}" in SKIP_DANDISETS:
        return False

    dandi_id = DANDI_ID.format(dandiset)
    try:
        url = parse_dandi_url(dandi_id)
        # NOTE(review): strict=True is presumably what makes a nonexistent
        # dandiset raise here instead of yielding nothing; confirm against
        # the dandi documentation.
        with url.navigate(strict=True) as (_client, remote, assets):
            # Test the version first so the asset listing is never walked
            # for a dandiset that will be rejected anyway.
            if remote.version.identifier == 'draft':
                return False
            # Generator form stops at the first NWB asset found.
            return any(asset.path.endswith('.nwb') for asset in assets)
    except Exception:
        # Best effort: any lookup failure means "do not download".
        # Deliberately not a bare ``except`` so that KeyboardInterrupt and
        # SystemExit still stop the scraper.
        return False
|
||||||
|
|
||||||
|
|
||||||
|
def _log_status(number: int, status: str) -> None:
    """Append one timestamped status line for dandiset *number* to ``LOG_TXT``."""
    with open(LOG_TXT, 'a') as lfile:
        lfile.write(f"{datetime.now().isoformat()} - {number:03d} - {status}\n")


def main():
    """Download every eligible dandiset numbered 1 through ``MAX_DANDISET``.

    Each number is first screened with ``check_nwb``. Rejected dandisets are
    logged as ``SKIP``. Accepted ones are downloaded into ``OUT_DIR`` with
    ``--existing refresh`` and 24 parallel jobs, then logged as ``GET``.
    If a download raises, ``FAIL`` is logged for that dandiset and the
    exception is re-raised.

    Raises:
        Exception: Whatever the download command raises; it is logged and
            propagated unchanged.
    """
    # MAX_DANDISET is treated as inclusive, so the range ends one past it.
    for number in trange(1, MAX_DANDISET + 1):
        if not check_nwb(number):
            _log_status(number, 'SKIP')
            continue

        cli_args = [
            DANDI_ID.format(number),
            '-o', str(OUT_DIR),
            '--existing', 'refresh',
            '--jobs', '24',
        ]
        try:
            # ``download`` is called with CLI-style arguments, i.e. as a
            # click command. In click's default standalone mode such a call
            # ends with sys.exit() even on success, which would end this
            # loop after the first dandiset. standalone_mode=False makes it
            # return normally and raise on errors.
            download(cli_args, standalone_mode=False)
        except Exception:
            # Record where the run stopped, then let the error surface.
            _log_status(number, 'FAIL')
            raise

        _log_status(number, 'GET')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue