# scrape-dandi/scrape_dandi/__init__.py
# 2023-10-26 20:11:01 -07:00
# 68 lines, 1.9 KiB, Python

from pathlib import Path
from tqdm import trange
from datetime import datetime
import sys
import click
from dandi.consts import ZARR_EXTENSIONS, metadata_all_fields
from dandi.dandiarchive import DandisetURL, _dandi_url_parser, parse_dandi_url
from dandi.cli.cmd_download import download
# Destination root for all downloaded dandisets.
OUT_DIR: Path = Path('/mnt/seedbank/p2p/dandi/')
#OUT_DIR = '.'
# Append-only progress log (one timestamped line per dandiset attempt).
LOG_TXT: str = 'log.txt'
# Dandiset IDs to skip entirely (never queried or downloaded).
SKIP_DANDISETS: list = [
    108 # humongous 372 human light sheet imaging
]
# Template for canonical dandiset identifiers, e.g. 'DANDI:000108'.
DANDI_ID: str = 'DANDI:{:06d}'
# Highest dandiset number to try (exclusive upper bound of the scan loop).
MAX_DANDISET: int = 683
# Parallel download jobs passed to the dandi CLI.
JOBS: int = 64
def check_nwb(dandiset: int) -> bool:
    """Return True if the dandiset has a published (non-draft) version
    containing at least one NWB asset.

    Parameters
    ----------
    dandiset : int
        Numeric dandiset identifier (formatted via DANDI_ID).

    Returns
    -------
    bool
        False for dandiset 108 (hard-coded skip: huge light-sheet set),
        False on any lookup/parse/network failure, True otherwise.
    """
    if dandiset == 108:
        return False
    # Avoid shadowing the builtin `id`.
    dandi_id = DANDI_ID.format(dandiset)
    try:
        url = parse_dandi_url(dandi_id)
        # strict=True makes navigate() raise for nonexistent dandisets.
        with url.navigate(strict=True) as (client, resolved, assets):
            has_nwb = any(a.path.endswith('nwb') for a in assets)
            is_published = resolved.version.identifier != 'draft'
            return has_nwb and is_published
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate (main() relies on Ctrl-C reaching it).
        return False
def _log_status(dandiset: int, status: str, detail: str = '') -> None:
    """Append a timestamped status line (and optional detail line) to LOG_TXT."""
    with open(LOG_TXT, 'a') as lfile:
        lfile.write(f"{datetime.now().isoformat()} - {dandiset:03d} - {status}\n")
        if detail:
            # Original code omitted the newline here, so the next log entry
            # was concatenated onto the error text.
            lfile.write(detail + '\n')


def main():
    """Scan dandisets 0..MAX_DANDISET-1 and download each published NWB
    dandiset into OUT_DIR, recording SKIP/GET/ERROR outcomes in LOG_TXT."""
    for i in trange(MAX_DANDISET):
        # Test the cheap local skip-list first so skip-listed sets never
        # trigger the network round-trip inside check_nwb().
        if i in SKIP_DANDISETS or not check_nwb(i):
            _log_status(i, 'SKIP')
            continue
        dandi_id = DANDI_ID.format(i)  # avoid shadowing builtin `id`
        try:
            # Invoke the dandi CLI download command programmatically;
            # standalone_mode=False stops click from calling sys.exit().
            download.main([
                dandi_id,
                '-o', str(OUT_DIR),
                '--existing', 'refresh',
                '--jobs', str(JOBS),
            ], standalone_mode=False)
            print('\nafterdl')
            _log_status(i, 'GET')
        except KeyboardInterrupt:
            # Propagate user abort as a nonzero exit instead of logging it.
            sys.exit(1)
        except Exception as e:
            print('\nexception')
            _log_status(i, 'ERROR', str(e))