Skip embargoed dandisets, start at nonzero dandiset

This commit is contained in:
sneakers-the-rat 2023-10-30 12:41:16 -07:00
parent e89e8cf913
commit 17b580638f

View file

@ -2,10 +2,15 @@ from pathlib import Path
from tqdm import trange from tqdm import trange
from datetime import datetime from datetime import datetime
import sys import sys
from typing import Optional
import click import click
from dandi.consts import ZARR_EXTENSIONS, metadata_all_fields from dandi.consts import ZARR_EXTENSIONS, metadata_all_fields
from dandi.dandiarchive import DandisetURL, _dandi_url_parser, parse_dandi_url from dandi.dandiarchive import DandisetURL, _dandi_url_parser, parse_dandi_url, ParsedDandiURL
from dandi.cli.cmd_download import download from dandi.cli.cmd_download import download
from enum import StrEnum
from dataclasses import dataclass
import requests
import argparse
OUT_DIR = Path('/mnt/seedbank/p2p/dandi/') OUT_DIR = Path('/mnt/seedbank/p2p/dandi/')
@ -18,6 +23,17 @@ DANDI_ID = 'DANDI:{:06d}'
MAX_DANDISET = 683 MAX_DANDISET = 683
JOBS = 64 JOBS = 64
class DownloadResultType(StrEnum):
SKIP = 'SKIP'
GET = 'GET'
ERROR = 'ERROR'
@dataclass
class DownloadResult:
    """Record describing what happened when one dandiset was processed."""

    # Formatted dandiset identifier string for the processed dandiset.
    id: str
    # Which outcome occurred (skip / get / error).
    type: DownloadResultType
    # The exception that was caught, if any; stays None when nothing was caught.
    err: Optional[Exception] = None
def check_nwb(dandiset:int) -> bool: def check_nwb(dandiset:int) -> bool:
if dandiset == 108: if dandiset == 108:
@ -26,6 +42,8 @@ def check_nwb(dandiset:int) -> bool:
id = DANDI_ID.format(dandiset) id = DANDI_ID.format(dandiset)
try: try:
url = parse_dandi_url(id) url = parse_dandi_url(id)
if not check_auth(url):
return False
with url.navigate(strict=True) as (c, dandiset,assets): with url.navigate(strict=True) as (c, dandiset,assets):
is_nwb = any([a.path.endswith('nwb') for a in assets]) is_nwb = any([a.path.endswith('nwb') for a in assets])
is_not_draft = dandiset.version.identifier != 'draft' is_not_draft = dandiset.version.identifier != 'draft'
@ -33,14 +51,24 @@ def check_nwb(dandiset:int) -> bool:
except: except:
return False return False
def check_auth(url:ParsedDandiURL) -> bool:
    """Report whether the dandiset behind ``url`` can be fetched.

    Opens a client from ``url`` and requests the dandiset with it.

    Args:
        url: Parsed DANDI URL identifying the dandiset to probe.

    Returns:
        ``False`` when the request fails with HTTP 401, ``True`` when the
        request succeeds.

    Raises:
        requests.HTTPError: For any HTTP failure other than a 401,
            including an error that carries no response object.
    """
    with url.get_client() as client:
        try:
            # Only success/failure of the call matters, so the returned
            # object is deliberately discarded.
            # NOTE(review): lazy=False presumably forces the request to be
            # made here rather than deferred -- confirm against dandi docs.
            url.get_dandiset(client, lazy=False)
        except requests.HTTPError as e:
            # ``response`` can be None on an HTTPError; guard so the
            # original error is re-raised instead of an AttributeError.
            if e.response is not None and e.response.status_code == 401:
                return False
            raise
    return True
def main(): def download_dandiset(i:int) -> DownloadResult:
id = DANDI_ID.format(i)
for i in trange(MAX_DANDISET):
if not check_nwb(i) or i in SKIP_DANDISETS: if not check_nwb(i) or i in SKIP_DANDISETS:
with open(LOG_TXT, 'a') as lfile: return DownloadResult(
lfile.write(f"{datetime.now().isoformat()} - {i:03d} - SKIP\n") id=id,
continue type=DownloadResultType.SKIP
)
id = DANDI_ID.format(i) id = DANDI_ID.format(i)
try: try:
download.main([ download.main([
@ -51,17 +79,49 @@ def main():
'--jobs', str(JOBS) '--jobs', str(JOBS)
], standalone_mode=False ], standalone_mode=False
) )
print('\nafterdl')
except Exception as e:
return DownloadResult(
id=id,
type=DownloadResultType.ERROR,
err=e
)
return DownloadResult(
id=id,
type=DownloadResultType.GET
)
# continue
def argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for the script.

    Returns:
        A parser with a single option, ``-s`` / ``--start``: an integer
        that defaults to 1.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-s', '--start',
        # add_argument() takes ``help``, not ``description``; passing
        # ``description`` raises TypeError when the parser is built.
        help="Dandiset ID (int) to start scraping (default 1)",
        default=1,
        type=int
    )
    return parser
def main():
args = argparser().parse_args()
for i in trange(args.start, MAX_DANDISET):
try:
res = download_dandiset(i)
if res.type == DownloadResultType.SKIP:
with open(LOG_TXT, 'a') as lfile:
lfile.write(f"{datetime.now().isoformat()} - {i:03d} - SKIP\n")
elif res.type == DownloadResultType.ERROR:
with open(LOG_TXT, 'a') as lfile:
lfile.write(f"{datetime.now().isoformat()} - {i:03d} - ERROR\n")
lfile.write(str(res.err))
elif res.type == DownloadResultType.GET:
with open(LOG_TXT, 'a') as lfile: with open(LOG_TXT, 'a') as lfile:
lfile.write(f"{datetime.now().isoformat()} - {i:03d} - GET\n") lfile.write(f"{datetime.now().isoformat()} - {i:03d} - GET\n")
except KeyboardInterrupt: except KeyboardInterrupt:
sys.exit(1) break
except Exception as e:
print('\nexception')
with open(LOG_TXT, 'a') as lfile:
lfile.write(f"{datetime.now().isoformat()} - {i:03d} - ERROR\n")
lfile.write(str(e))
# continue