"""Scrape published NWB dandisets from the DANDI archive into a local mirror.

Iterates over dandiset IDs, skips drafts, non-NWB dandisets, embargoed
dandisets, and a manual blocklist, then logs each outcome to LOG_TXT.
"""
from pathlib import Path
from tqdm import trange
from datetime import datetime
from typing import Optional
from dandi.dandiarchive import parse_dandi_url, ParsedDandiURL
from dandi.cli.cmd_download import download
from enum import StrEnum
from dataclasses import dataclass
import requests
import argparse

OUT_DIR = Path('/mnt/seedbank/p2p/dandi/')
# OUT_DIR = Path('.')
LOG_TXT = 'log.txt'
SKIP_DANDISETS = [
    108,  # humongous
    372,  # human light sheet imaging
]
DANDI_ID = 'DANDI:{:06d}'
MAX_DANDISET = 683
JOBS = 64


class DownloadResultType(StrEnum):
    SKIP = 'SKIP'
    GET = 'GET'
    ERROR = 'ERROR'


@dataclass
class DownloadResult:
    id: str
    type: DownloadResultType
    err: Optional[Exception] = None


def check_nwb(dandiset_id: int) -> bool:
    """Return True if the dandiset is a published (non-draft) NWB dandiset."""
    if dandiset_id == 108:
        # Short-circuit before any network calls: 108 is humongous.
        return False
    dandi_id = DANDI_ID.format(dandiset_id)
    try:
        url = parse_dandi_url(dandi_id)
        if not check_auth(url):
            return False
        with url.navigate(strict=True) as (client, dandiset, assets):
            is_nwb = any(a.path.endswith('nwb') for a in assets)
            is_not_draft = dandiset.version.identifier != 'draft'
            return is_nwb and is_not_draft
    except Exception:
        # Treat any lookup failure (missing dandiset, API error) as a skip.
        return False


def check_auth(url: ParsedDandiURL) -> bool:
    """Return False for dandisets that 401 without credentials (embargoed)."""
    with url.get_client() as client:
        try:
            url.get_dandiset(client, lazy=False)
        except requests.HTTPError as e:
            if e.response.status_code == 401:
                return False
            raise
    return True


def download_dandiset(i: int) -> DownloadResult:
    dandi_id = DANDI_ID.format(i)
    if not check_nwb(i) or i in SKIP_DANDISETS:
        return DownloadResult(id=dandi_id, type=DownloadResultType.SKIP)
    try:
        # Invoke the dandi CLI download command programmatically;
        # standalone_mode=False keeps click from calling sys.exit().
        download.main(
            [dandi_id, '-o', str(OUT_DIR), '--existing', 'refresh', '--jobs', str(JOBS)],
            standalone_mode=False,
        )
    except Exception as e:
        return DownloadResult(id=dandi_id, type=DownloadResultType.ERROR, err=e)
    return DownloadResult(id=dandi_id, type=DownloadResultType.GET)


def argparser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-s', '--start',
        help="Dandiset ID (int) to start scraping (default 1)",
        default=1,
        type=int,
    )
    return parser


def main():
    args = argparser().parse_args()
    for i in trange(args.start, MAX_DANDISET):
        try:
            res = download_dandiset(i)
            if res.type == DownloadResultType.SKIP:
                with open(LOG_TXT, 'a') as lfile:
                    lfile.write(f"{datetime.now().isoformat()} - {i:03d} - SKIP\n")
            elif res.type == DownloadResultType.ERROR:
                with open(LOG_TXT, 'a') as lfile:
                    lfile.write(f"{datetime.now().isoformat()} - {i:03d} - ERROR\n")
                    lfile.write(f"{res.err}\n")
            elif res.type == DownloadResultType.GET:
                with open(LOG_TXT, 'a') as lfile:
                    lfile.write(f"{datetime.now().isoformat()} - {i:03d} - GET\n")
        except KeyboardInterrupt:
            break


if __name__ == '__main__':
    main()
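# Example invocation (the filename below is hypothetical; adjust to wherever
# this script lives). Resumes scraping from dandiset 200 rather than 1, which
# is useful after an interrupted run:
#
#     python scrape_dandi.py --start 200
#
# Each dandiset appends one timestamped SKIP/GET/ERROR line to log.txt in the
# current working directory, so a later pass can be started just past the
# last logged ID.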