From 17b580638f46fc68cae7be1714fe718f8aa5f0bf Mon Sep 17 00:00:00 2001
From: sneakers-the-rat
Date: Mon, 30 Oct 2023 12:41:16 -0700
Subject: [PATCH] Skip embargoed dandisets, start at nonzero dandiset

---
Reviewer notes (free text after the three-dash line; not part of the commit
message and skipped when the patch is applied):

* Layout: this patch had been flattened onto a few very long lines. It is
  restored here to one header / diffstat row / hunk line per physical line.
* Fix: argparser() passed `description=` to add_argument(). That keyword is
  only valid on the ArgumentParser constructor; add_argument() raises
  TypeError for it, so main() could never get past argument parsing. It is
  now `help=`.
* Fix: the ERROR log entry wrote the exception text without a trailing
  newline, so the following entry's timestamp was appended to the same line.
  The exception text is now written with a newline.
* NOTE(review): leading whitespace and blank context lines were lost in the
  flattened copy. Indentation was re-derived from Python syntax (4-space
  steps assumed) and blank context lines were re-inserted so every hunk
  matches the line counts in its header. In the last hunk the split of blank
  lines between "before def main()" and "end of file" is a guess - confirm
  against pre-image blob d930b04 before applying.
* NOTE(review): the `index` line still names the original post-image blob
  (e96e214); with the two fixes above the resulting blob will differ.

 scrape_dandi/__init__.py | 112 ++++++++++++++++++++++++++++++---------
 1 file changed, 86 insertions(+), 26 deletions(-)

diff --git a/scrape_dandi/__init__.py b/scrape_dandi/__init__.py
index d930b04..e96e214 100644
--- a/scrape_dandi/__init__.py
+++ b/scrape_dandi/__init__.py
@@ -2,10 +2,15 @@ from pathlib import Path
 from tqdm import trange
 from datetime import datetime
 import sys
+from typing import Optional
 import click
 from dandi.consts import ZARR_EXTENSIONS, metadata_all_fields
-from dandi.dandiarchive import DandisetURL, _dandi_url_parser, parse_dandi_url
+from dandi.dandiarchive import DandisetURL, _dandi_url_parser, parse_dandi_url, ParsedDandiURL
 from dandi.cli.cmd_download import download
+from enum import StrEnum
+from dataclasses import dataclass
+import requests
+import argparse
 
 OUT_DIR = Path('/mnt/seedbank/p2p/dandi/')
 
@@ -18,6 +23,17 @@ DANDI_ID = 'DANDI:{:06d}'
 MAX_DANDISET = 683
 JOBS = 64
 
+class DownloadResultType(StrEnum):
+    SKIP = 'SKIP'
+    GET = 'GET'
+    ERROR = 'ERROR'
+
+@dataclass
+class DownloadResult:
+    id: str
+    type: DownloadResultType
+    err: Optional[Exception] = None
+
 def check_nwb(dandiset:int) -> bool:
 
     if dandiset == 108:
@@ -26,6 +42,8 @@ def check_nwb(dandiset:int) -> bool:
     id = DANDI_ID.format(dandiset)
     try:
         url = parse_dandi_url(id)
+        if not check_auth(url):
+            return False
         with url.navigate(strict=True) as (c, dandiset,assets):
             is_nwb = any([a.path.endswith('nwb') for a in assets])
             is_not_draft = dandiset.version.identifier != 'draft'
@@ -33,35 +51,77 @@ def check_nwb(dandiset:int) -> bool:
     except:
         return False
 
+def check_auth(url:ParsedDandiURL) -> bool:
+    with url.get_client() as client:
+        try:
+            dandiset = url.get_dandiset(client, lazy=False)
+        except requests.HTTPError as e:
+            if e.response.status_code == 401:
+                return False
+            else:
+                raise
+    return True
+
+def download_dandiset(i:int) -> DownloadResult:
+    id = DANDI_ID.format(i)
+    if not check_nwb(i) or i in SKIP_DANDISETS:
+        return DownloadResult(
+            id=id,
+            type=DownloadResultType.SKIP
+        )
+    id = DANDI_ID.format(i)
+    try:
+        download.main([
+
+            id,
+            '-o', str(OUT_DIR),
+            '--existing', 'refresh',
+            '--jobs', str(JOBS)
+        ], standalone_mode=False
+        )
+
+    except Exception as e:
+        return DownloadResult(
+            id=id,
+            type=DownloadResultType.ERROR,
+            err=e
+        )
+
+    return DownloadResult(
+        id=id,
+        type=DownloadResultType.GET
+    )
+    # continue
+
+def argparser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '-s', '--start',
+        help="Dandiset ID (int) to start scraping (default 1)",
+        default=1,
+        type=int
+    )
+    return parser
 
 
 
 
 def main():
-
-    for i in trange(MAX_DANDISET):
-        if not check_nwb(i) or i in SKIP_DANDISETS:
-            with open(LOG_TXT, 'a') as lfile:
-                lfile.write(f"{datetime.now().isoformat()} - {i:03d} - SKIP\n")
-            continue
-        id = DANDI_ID.format(i)
+    args = argparser().parse_args()
+
+    for i in trange(args.start, MAX_DANDISET):
         try:
-            download.main([
-
-                id,
-                '-o', str(OUT_DIR),
-                '--existing', 'refresh',
-                '--jobs', str(JOBS)
-            ], standalone_mode=False
-            )
-            print('\nafterdl')
-            with open(LOG_TXT, 'a') as lfile:
-                lfile.write(f"{datetime.now().isoformat()} - {i:03d} - GET\n")
+            res = download_dandiset(i)
+            if res.type == DownloadResultType.SKIP:
+                with open(LOG_TXT, 'a') as lfile:
+                    lfile.write(f"{datetime.now().isoformat()} - {i:03d} - SKIP\n")
+            elif res.type == DownloadResultType.ERROR:
+                with open(LOG_TXT, 'a') as lfile:
+                    lfile.write(f"{datetime.now().isoformat()} - {i:03d} - ERROR\n")
+                    lfile.write(f"{res.err}\n")
+            elif res.type == DownloadResultType.GET:
+                with open(LOG_TXT, 'a') as lfile:
+                    lfile.write(f"{datetime.now().isoformat()} - {i:03d} - GET\n")
         except KeyboardInterrupt:
-            sys.exit(1)
-        except Exception as e:
-            print('\nexception')
-            with open(LOG_TXT, 'a') as lfile:
-                lfile.write(f"{datetime.now().isoformat()} - {i:03d} - ERROR\n")
-                lfile.write(str(e))
-        # continue
+            break
+