Skip embargoed dandisets, start at nonzero dandiset

sneakers-the-rat 2023-10-30 12:41:16 -07:00
parent e89e8cf913
commit 17b580638f


@@ -2,10 +2,15 @@ from pathlib import Path
 from tqdm import trange
 from datetime import datetime
 import sys
+from typing import Optional
 import click
 from dandi.consts import ZARR_EXTENSIONS, metadata_all_fields
-from dandi.dandiarchive import DandisetURL, _dandi_url_parser, parse_dandi_url
+from dandi.dandiarchive import DandisetURL, _dandi_url_parser, parse_dandi_url, ParsedDandiURL
 from dandi.cli.cmd_download import download
+from enum import StrEnum
+from dataclasses import dataclass
+import requests
+import argparse

 OUT_DIR = Path('/mnt/seedbank/p2p/dandi/')
@@ -18,6 +23,17 @@ DANDI_ID = 'DANDI:{:06d}'
 MAX_DANDISET = 683
 JOBS = 64

+class DownloadResultType(StrEnum):
+    SKIP = 'SKIP'
+    GET = 'GET'
+    ERROR = 'ERROR'
+
+@dataclass
+class DownloadResult:
+    id: str
+    type: DownloadResultType
+    err: Optional[Exception] = None
+
 def check_nwb(dandiset:int) -> bool:
     if dandiset == 108:
@@ -26,6 +42,8 @@ def check_nwb(dandiset:int) -> bool:
     id = DANDI_ID.format(dandiset)
     try:
         url = parse_dandi_url(id)
+        if not check_auth(url):
+            return False
         with url.navigate(strict=True) as (c, dandiset, assets):
             is_nwb = any([a.path.endswith('nwb') for a in assets])
             is_not_draft = dandiset.version.identifier != 'draft'
@@ -33,35 +51,77 @@ def check_nwb(dandiset:int) -> bool:
     except:
         return False

+def check_auth(url:ParsedDandiURL) -> bool:
+    with url.get_client() as client:
+        try:
+            dandiset = url.get_dandiset(client, lazy=False)
+        except requests.HTTPError as e:
+            if e.response.status_code == 401:
+                return False
+            else:
+                raise
+    return True
+
+def download_dandiset(i:int) -> DownloadResult:
+    id = DANDI_ID.format(i)
+    if not check_nwb(i) or i in SKIP_DANDISETS:
+        return DownloadResult(
+            id=id,
+            type=DownloadResultType.SKIP
+        )
+    id = DANDI_ID.format(i)
+    try:
+        download.main([
+            id,
+            '-o', str(OUT_DIR),
+            '--existing', 'refresh',
+            '--jobs', str(JOBS)
+            ], standalone_mode=False
+        )
+    except Exception as e:
+        return DownloadResult(
+            id=id,
+            type=DownloadResultType.ERROR,
+            err=e
+        )
+    return DownloadResult(
+        id=id,
+        type=DownloadResultType.GET
+    )
+    # continue
+
+def argparser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '-s', '--start',
+        help="Dandiset ID (int) to start scraping (default 1)",
+        default=1,
+        type=int
+    )
+    return parser
+
 def main():
-    for i in trange(MAX_DANDISET):
-        if not check_nwb(i) or i in SKIP_DANDISETS:
-            with open(LOG_TXT, 'a') as lfile:
-                lfile.write(f"{datetime.now().isoformat()} - {i:03d} - SKIP\n")
-            continue
-        id = DANDI_ID.format(i)
+    args = argparser().parse_args()
+    for i in trange(args.start, MAX_DANDISET):
         try:
-            download.main([
-                id,
-                '-o', str(OUT_DIR),
-                '--existing', 'refresh',
-                '--jobs', str(JOBS)
-                ], standalone_mode=False
-            )
-            print('\nafterdl')
-            with open(LOG_TXT, 'a') as lfile:
-                lfile.write(f"{datetime.now().isoformat()} - {i:03d} - GET\n")
+            res = download_dandiset(i)
+            if res.type == DownloadResultType.SKIP:
+                with open(LOG_TXT, 'a') as lfile:
+                    lfile.write(f"{datetime.now().isoformat()} - {i:03d} - SKIP\n")
+            elif res.type == DownloadResultType.ERROR:
+                with open(LOG_TXT, 'a') as lfile:
+                    lfile.write(f"{datetime.now().isoformat()} - {i:03d} - ERROR\n")
+                    lfile.write(str(res.err))
+            elif res.type == DownloadResultType.GET:
+                with open(LOG_TXT, 'a') as lfile:
+                    lfile.write(f"{datetime.now().isoformat()} - {i:03d} - GET\n")
         except KeyboardInterrupt:
             sys.exit(1)
         except Exception as e:
             print('\nexception')
             with open(LOG_TXT, 'a') as lfile:
                 lfile.write(f"{datetime.now().isoformat()} - {i:03d} - ERROR\n")
                 lfile.write(str(e))
             # continue
             break
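
For reference, the embargo check this commit introduces can be exercised on its own. The sketch below is a minimal, hypothetical helper (the name is_embargoed and the example identifier are not from the commit); it follows the same pattern as the new check_auth, treating an HTTP 401 from an unauthenticated metadata fetch as "embargoed", and assumes the dandi client calls used in the diff (parse_dandi_url, get_client, get_dandiset) behave as shown there.

    from dandi.dandiarchive import parse_dandi_url
    import requests

    def is_embargoed(dandiset_id: str) -> bool:
        """Return True if fetching dandiset metadata without auth fails with HTTP 401."""
        url = parse_dandi_url(dandiset_id)            # e.g. 'DANDI:000003'
        with url.get_client() as client:
            try:
                url.get_dandiset(client, lazy=False)  # same call check_auth relies on
            except requests.HTTPError as e:
                if e.response.status_code == 401:     # unauthenticated access refused
                    return True
                raise
        return False

    if __name__ == "__main__":
        # Example identifier only; whether it is embargoed depends on the archive.
        print(is_embargoed('DANDI:000003'))

With the new --start flag, an interrupted scrape can be resumed from a later dandiset, e.g. passing "--start 100" when running the script.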