scrape-dandi/scrape_dandi/__init__.py

from pathlib import Path
from tqdm import trange
from datetime import datetime

from dandi.consts import ZARR_EXTENSIONS, metadata_all_fields
from dandi.dandiarchive import DandisetURL, _dandi_url_parser, parse_dandi_url
from dandi.cli.cmd_download import download


# Directory handed to the downloader as its output location.
# Alternative (currently disabled): OUT_DIR = '.'
OUT_DIR = Path('/mnt/seedbank/p2p/dandi/')

# Progress log; one line is appended per dandiset visited by main().
LOG_TXT = 'log.txt'

# Zero-padded dandiset identifiers that should not be downloaded.
# 000108: humongous 372 human light sheet imaging
SKIP_DANDISETS = ['000108']

# Template turning a numeric dandiset number into a 'DANDI:NNNNNN' identifier.
DANDI_ID = 'DANDI:{:06d}'

# Exclusive upper bound of the dandiset numbers scanned by main().
MAX_DANDISET = 683


def check_nwb(dandiset: int) -> bool:
    """Report whether a dandiset should be downloaded.

    A dandiset qualifies when all of the following hold:

    * its zero-padded identifier is not listed in ``SKIP_DANDISETS``;
    * the version its ``DANDI:<id>`` URL resolves to is not ``draft``;
    * at least one of its asset paths ends in ``nwb``.

    Args:
        dandiset: Numeric dandiset number (e.g. ``108`` for ``000108``).

    Returns:
        ``True`` if the dandiset qualifies, ``False`` otherwise. Any
        ``Exception`` raised while resolving or querying the dandiset is
        treated as "does not qualify" and also yields ``False``.
    """
    # Use the module-level skip list rather than a hard-coded number so the
    # list is the single place to exclude a dandiset.
    if f"{dandiset:06d}" in SKIP_DANDISETS:
        return False

    dandi_id = DANDI_ID.format(dandiset)
    try:
        url = parse_dandi_url(dandi_id)
        with url.navigate(strict=True) as (_client, resolved, assets):
            # Test the version before touching the assets so that drafts are
            # rejected without iterating over them.
            if resolved.version.identifier == 'draft':
                return False
            # Generator form lets any() stop at the first matching asset
            # instead of materialising the whole asset list first.
            return any(asset.path.endswith('nwb') for asset in assets)
    except Exception:
        # Best effort: a dandiset that cannot be resolved or queried is
        # skipped. Catching Exception (not a bare except) keeps
        # KeyboardInterrupt and SystemExit propagating to the caller.
        return False


def _log_status(dandiset: int, status: str) -> None:
    """Append one ``<iso timestamp> - <number> - <status>`` line to ``LOG_TXT``."""
    with open(LOG_TXT, 'a') as lfile:
        lfile.write(f"{datetime.now().isoformat()} - {dandiset:03d} - {status}\n")


def main():
    """Download every qualifying dandiset into ``OUT_DIR``.

    Walks the dandiset numbers ``0 .. MAX_DANDISET - 1``. Numbers rejected by
    ``check_nwb`` are logged as ``SKIP``; the rest are downloaded through the
    ``dandi download`` command and logged as ``GET`` once the download
    returns.

    Errors raised by the download command propagate to the caller and stop
    the run.
    """
    # NOTE(review): trange(MAX_DANDISET) never visits MAX_DANDISET itself; if
    # the constant is meant to be the highest existing dandiset number this is
    # an off-by-one -- confirm before changing the range.
    for i in trange(MAX_DANDISET):

        if not check_nwb(i):
            _log_status(i, 'SKIP')
            continue

        dandi_id = DANDI_ID.format(i)
        # `download` is a click command. Calling it directly runs click's
        # standalone mode, which exits the interpreter when the command
        # finishes -- ending this loop after the first dandiset. Invoking
        # main() with standalone_mode=False makes it return normally instead.
        download.main(
            args=[
                dandi_id,
                '-o', str(OUT_DIR),
                '--existing', 'refresh',
                '--jobs', '24',
            ],
            standalone_mode=False,
        )
        _log_status(i, 'GET')
initial and probably only 2023-10-24 06:57:44 +00:00			`from pathlib import Path`
			`from tqdm import trange`
			`from datetime import datetime`

			`from dandi.consts import ZARR_EXTENSIONS, metadata_all_fields`
			`from dandi.dandiarchive import DandisetURL, _dandi_url_parser, parse_dandi_url`
			`from dandi.cli.cmd_download import download`


that was a lie lmao 2023-10-24 06:58:07 +00:00			`OUT_DIR = Path('/mnt/seedbank/p2p/dandi/')`
			`#OUT_DIR = '.'`
initial and probably only 2023-10-24 06:57:44 +00:00			`LOG_TXT = 'log.txt'`
			`SKIP_DANDISETS = [`
			`'000108' # humongous 372 human light sheet imaging`
			`]`
			`DANDI_ID = 'DANDI:{:06d}'`
			`MAX_DANDISET = 683`



			`def check_nwb(dandiset:int) -> bool:`
			`if dandiset == 108:`
			`return False`

			`id = DANDI_ID.format(dandiset)`
			`try:`
			`url = parse_dandi_url(id)`
			`with url.navigate(strict=True) as (c, dandiset,assets):`
			`is_nwb = any([a.path.endswith('nwb') for a in assets])`
			`is_not_draft = dandiset.version.identifier != 'draft'`
			`return is_nwb and is_not_draft`
			`except:`
			`return False`


			`def main():`

			`for i in trange(MAX_DANDISET):`

			`if not check_nwb(i):`
			`with open(LOG_TXT, 'a') as lfile:`
			`lfile.write(f"{datetime.now().isoformat()} - {i:03d} - SKIP\n")`
			`continue`
			`id = DANDI_ID.format(i)`
			`download(`
			`[`
			`id,`
			`'-o', str(OUT_DIR),`
			`'--existing', 'refresh',`
			`'--jobs', '24'`
			`]`
			`)`
			`with open(LOG_TXT, 'a') as lfile:`
			`lfile.write(f"{datetime.now().isoformat()} - {i:03d} - GET\n")`