Skip embargoed dandisets, start at nonzero dandiset

parent e89e8cf913
commit 17b580638f
1 changed file with 86 additions and 26 deletions
@@ -2,10 +2,15 @@ from pathlib import Path
 from tqdm import trange
 from datetime import datetime
 import sys
+from typing import Optional
 import click
 from dandi.consts import ZARR_EXTENSIONS, metadata_all_fields
-from dandi.dandiarchive import DandisetURL, _dandi_url_parser, parse_dandi_url
+from dandi.dandiarchive import DandisetURL, _dandi_url_parser, parse_dandi_url, ParsedDandiURL
 from dandi.cli.cmd_download import download
+from enum import StrEnum
+from dataclasses import dataclass
+import requests
+import argparse
 
 
 OUT_DIR = Path('/mnt/seedbank/p2p/dandi/')
@@ -18,6 +23,17 @@ DANDI_ID = 'DANDI:{:06d}'
 MAX_DANDISET = 683
 JOBS = 64
 
+class DownloadResultType(StrEnum):
+    SKIP = 'SKIP'
+    GET = 'GET'
+    ERROR = 'ERROR'
+
+@dataclass
+class DownloadResult:
+    id: str
+    type: DownloadResultType
+    err: Optional[Exception] = None
+
 
 def check_nwb(dandiset:int) -> bool:
     if dandiset == 108:
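A portability note on the hunk above: enum.StrEnum only exists on Python 3.11 and later, so the new import fails outright on older interpreters. A minimal fallback sketch for pre-3.11 Pythons, assuming nothing else in the script relies on StrEnum itself, is to mix str into a plain Enum, which gives equally string-valued members:

    # Equivalent for Python < 3.11: members compare equal to their
    # string values, matching how DownloadResultType is used here.
    from enum import Enum

    class DownloadResultType(str, Enum):
        SKIP = 'SKIP'
        GET = 'GET'
        ERROR = 'ERROR'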
@@ -26,6 +42,8 @@ def check_nwb(dandiset:int) -> bool:
     id = DANDI_ID.format(dandiset)
     try:
         url = parse_dandi_url(id)
+        if not check_auth(url):
+            return False
         with url.navigate(strict=True) as (c, dandiset,assets):
             is_nwb = any([a.path.endswith('nwb') for a in assets])
             is_not_draft = dandiset.version.identifier != 'draft'
@@ -33,35 +51,77 @@ def check_nwb(dandiset:int) -> bool:
     except:
         return False
 
+def check_auth(url:ParsedDandiURL) -> bool:
+    with url.get_client() as client:
+        try:
+            dandiset = url.get_dandiset(client, lazy=False)
+        except requests.HTTPError as e:
+            if e.response.status_code == 401:
+                return False
+            else:
+                raise
+    return True
+
+def download_dandiset(i:int) -> DownloadResult:
+    id = DANDI_ID.format(i)
+    if not check_nwb(i) or i in SKIP_DANDISETS:
+        return DownloadResult(
+            id=id,
+            type=DownloadResultType.SKIP
+        )
+    id = DANDI_ID.format(i)
+    try:
+        download.main([
+
+            id,
+            '-o', str(OUT_DIR),
+            '--existing', 'refresh',
+            '--jobs', str(JOBS)
+        ], standalone_mode=False
+        )
+
+    except Exception as e:
+        return DownloadResult(
+            id=id,
+            type=DownloadResultType.ERROR,
+            err=e
+        )
+
+    return DownloadResult(
+        id=id,
+        type=DownloadResultType.GET
+    )
+    # continue
+
+def argparser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '-s', '--start',
+        description="Dandiset ID (int) to start scraping (default 1)",
+        default=1,
+        type=int
+    )
+    return parser
 
 def main():
+    args = argparser().parse_args()
 
-    for i in trange(MAX_DANDISET):
-        if not check_nwb(i) or i in SKIP_DANDISETS:
-            with open(LOG_TXT, 'a') as lfile:
-                lfile.write(f"{datetime.now().isoformat()} - {i:03d} - SKIP\n")
-            continue
-        id = DANDI_ID.format(i)
+    for i in trange(args.start, MAX_DANDISET):
         try:
-            download.main([
-                id,
-                '-o', str(OUT_DIR),
-                '--existing', 'refresh',
-                '--jobs', str(JOBS)
-            ], standalone_mode=False
-            )
-            print('\nafterdl')
+            res = download_dandiset(i)
+            if res.type == DownloadResultType.SKIP:
+                with open(LOG_TXT, 'a') as lfile:
+                    lfile.write(f"{datetime.now().isoformat()} - {i:03d} - SKIP\n")
+            elif res.type == DownloadResultType.ERROR:
+                with open(LOG_TXT, 'a') as lfile:
+                    lfile.write(f"{datetime.now().isoformat()} - {i:03d} - ERROR\n")
+                    lfile.write(str(res.err))
+            elif res.type == DownloadResultType.GET:
                 with open(LOG_TXT, 'a') as lfile:
                     lfile.write(f"{datetime.now().isoformat()} - {i:03d} - GET\n")
         except KeyboardInterrupt:
-            sys.exit(1)
-        except Exception as e:
-            print('\nexception')
-            with open(LOG_TXT, 'a') as lfile:
-                lfile.write(f"{datetime.now().isoformat()} - {i:03d} - ERROR\n")
-                lfile.write(str(e))
-            # continue
+            break
-
-
-
-
+
+
+
+
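One bug worth flagging in the new argparser(): argparse's add_argument() does not accept a description keyword (that belongs to the ArgumentParser constructor); the per-option text goes in help. As committed, the call raises TypeError as soon as main() runs. A corrected sketch:

    import argparse

    def argparser() -> argparse.ArgumentParser:
        parser = argparse.ArgumentParser()
        parser.add_argument(
            '-s', '--start',
            help="Dandiset ID (int) to start scraping (default 1)",
            default=1,
            type=int
        )
        return parser

With that fix, resuming a partial scrape looks something like python scrape.py --start 100 (the script's filename is not shown in this diff).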