scrape-dandi/scrape_dandi/__init__.py

69 lines
1.9 KiB
Python
Raw Normal View History

2023-10-24 06:57:44 +00:00
from pathlib import Path
from tqdm import trange
from datetime import datetime
2023-10-27 03:11:01 +00:00
import sys
import click
2023-10-24 06:57:44 +00:00
from dandi.consts import ZARR_EXTENSIONS, metadata_all_fields
from dandi.dandiarchive import DandisetURL, _dandi_url_parser, parse_dandi_url
from dandi.cli.cmd_download import download
2023-10-24 06:58:07 +00:00
# Directory handed to the dandi downloader via its `-o` option.
OUT_DIR = Path('/mnt/seedbank/p2p/dandi/')
#OUT_DIR = '.'

# Progress log; opened in append mode for every SKIP / GET / ERROR entry.
LOG_TXT = 'log.txt'

# Dandiset numbers that are never downloaded, even if they qualify otherwise.
SKIP_DANDISETS = [
    108,  # humongous 372 human light sheet imaging
]

# Template turning a dandiset number into its zero-padded identifier,
# e.g. 108 -> 'DANDI:000108'.
DANDI_ID = 'DANDI:{:06d}'

# Upper bound of the dandiset numbers iterated by main().
MAX_DANDISET = 683

# Value passed to the downloader's `--jobs` option.
JOBS = 64
2023-10-24 06:57:44 +00:00
def check_nwb(dandiset: int) -> bool:
    """Return True if the dandiset should be downloaded.

    A dandiset qualifies when at least one of its asset paths ends with
    ``'nwb'`` and the resolved version identifier is not ``'draft'``.
    Dandiset 108 is rejected outright, before any lookup is attempted.

    Args:
        dandiset: Numeric dandiset id; formatted with ``DANDI_ID``.

    Returns:
        True when both conditions hold. False otherwise, including when
        resolving or navigating the dandiset raises any ordinary exception.
    """
    if dandiset == 108:
        return False

    dandi_id = DANDI_ID.format(dandiset)
    try:
        url = parse_dandi_url(dandi_id)
        with url.navigate(strict=True) as (_client, resolved, assets):
            has_nwb = any(asset.path.endswith('nwb') for asset in assets)
            is_published = resolved.version.identifier != 'draft'
            return has_nwb and is_published
    except Exception:
        # Deliberately best-effort: a dandiset that cannot be resolved is
        # treated as "not downloadable". Catching Exception rather than using
        # a bare except lets KeyboardInterrupt / SystemExit propagate, so
        # Ctrl-C during a lookup stops the run instead of being logged as a skip.
        return False
def _log_status(index, status, detail=None):
    """Append one timestamped status line, plus an optional detail line, to LOG_TXT."""
    with open(LOG_TXT, 'a') as lfile:
        lfile.write(f"{datetime.now().isoformat()} - {index:03d} - {status}\n")
        if detail is not None:
            # Terminate the detail so the next entry starts on its own line.
            lfile.write(f"{detail}\n")


def main():
    """Download every qualifying dandiset into OUT_DIR, logging each outcome.

    For each dandiset number the loop appends one entry to LOG_TXT:
    ``SKIP`` when the number is in SKIP_DANDISETS or ``check_nwb`` rejects it,
    ``GET`` after the download call returns, or ``ERROR`` (followed by the
    exception text) when the download raises. An interrupt during a download
    exits the process with status 1.
    """
    # NOTE(review): trange(MAX_DANDISET) yields 0 .. MAX_DANDISET - 1, so the
    # dandiset numbered MAX_DANDISET itself is never visited; confirm intent.
    for i in trange(MAX_DANDISET):
        # Test the local skip list first so skipped ids cost no remote lookup.
        if i in SKIP_DANDISETS or not check_nwb(i):
            _log_status(i, 'SKIP')
            continue

        dandi_id = DANDI_ID.format(i)
        try:
            # standalone_mode=False makes click raise instead of calling
            # sys.exit(), so one failed dandiset does not end the whole run.
            download.main(
                [
                    dandi_id,
                    '-o', str(OUT_DIR),
                    '--existing', 'refresh',
                    '--jobs', str(JOBS),
                ],
                standalone_mode=False,
            )
            print('\nafterdl')
            _log_status(i, 'GET')
        except (KeyboardInterrupt, click.Abort):
            # In non-standalone mode click re-raises Ctrl-C as click.Abort;
            # without catching it here the interrupt would be logged as an
            # ordinary ERROR and the loop would carry on.
            sys.exit(1)
        except Exception as e:
            print('\nexception')
            _log_status(i, 'ERROR', str(e))
2023-10-24 06:57:44 +00:00