From 2f1781ada00bdd5d4a0181d669f8f4b8d74fccc7 Mon Sep 17 00:00:00 2001
From: flu0r1ne
Date: Mon, 26 Sep 2022 01:32:10 -0500
Subject: Add basic indexes

---
 clnindexdl.py | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)
 create mode 100755 clnindexdl.py

diff --git a/clnindexdl.py b/clnindexdl.py
new file mode 100755
index 0000000..5b1f537
--- /dev/null
+++ b/clnindexdl.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+
+import csv
+import subprocess
+import sys
+from pathlib import Path
+from multiprocessing import Pool
+
+def md5sum(path):
+    '''
+    Compute the MD5 message digest of a file and return its hex digest,
+    or None if the file does not exist yet
+    '''
+    if not Path(path).exists():
+        return None
+
+    res = subprocess.run(['md5sum', path], capture_output=True, check=True)
+    sums = str(res.stdout, encoding='ascii')
+
+    if sums != '':
+        return sums.split()[0]
+
+def flatten_filename(ftp_url):
+    '''
+    Transform URL into a unique filename:
+
+    ftp://domain.ext/some/path/read_file.gz -> some__path__read_file.gz
+    '''
+    _, ext = ftp_url.split('//')
+    _, *path = ext.split('/')
+    return '__'.join(path)
+
+def wget_and_verify(ftp_url, md5, output_dir):
+    '''
+    Download ftp_url into output_dir with curl and verify its MD5 checksum,
+    retrying up to three times on failure
+    '''
+    basename = flatten_filename(ftp_url)
+
+    filename = output_dir / basename
+
+    sums = md5sum(filename)
+
+    if sums == md5:
+        print(f'[{filename}] skipping, already exists')
+        return
+
+    print(f'[{filename}] : starting download')
+
+    retry = 0
+    while retry < 3:
+        out = subprocess.run(['curl', ftp_url, '--output', filename], capture_output=True)
+
+        if out.returncode != 0:
+            retry += 1
+            print(f'[{filename}] ERR: curl failed', file=sys.stderr)
+            continue
+
+        sums = md5sum(filename)
+
+        if sums != md5:
+            retry += 1
+            print(f'RETRY {retry}: ERR: checksum mismatch {sums} != {md5}', file=sys.stderr)
+        else:
+            print(f'[{filename}] : completed download')
+            return
+
+    print(f'[{filename}] ERR: MAXIMUM RETRIES EXCEEDED', file=sys.stderr)
+
+def download_all_from_index(index, output_dir, threads):
+    '''
+    Download every FASTQ and PAIRED_FASTQ file listed in a tab-separated
+    index, verifying checksums, using a pool of worker processes
+    '''
+    reader = csv.DictReader(index, delimiter='\t')
+
+    with Pool(threads) as pool:
+        res = []
+
+        for files in reader:
+            r = pool.apply_async(wget_and_verify, (files['FASTQ'], files['FASTQ_MD5'], output_dir))
+            res.append(r)
+            r = pool.apply_async(wget_and_verify, (files['PAIRED_FASTQ'], files['PAIRED_FASTQ_MD5'], output_dir))
+            res.append(r)
+
+        for r in res:
+            r.get()
--
cgit v1.2.3
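
The patch defines download_all_from_index() but no command-line entry point. The sketch below shows one way a driver might call it; it is not part of the commit. The index filename ('index.tsv'), output directory ('reads'), and thread count are assumptions for illustration; only the module name and the FASTQ/FASTQ_MD5/PAIRED_FASTQ/PAIRED_FASTQ_MD5 column names come from the script itself.

    # Hypothetical driver for clnindexdl.py -- a sketch, not part of the patch.
    # Assumes a tab-separated index whose header includes the FASTQ, FASTQ_MD5,
    # PAIRED_FASTQ, and PAIRED_FASTQ_MD5 columns expected by the script.
    from pathlib import Path

    from clnindexdl import download_all_from_index

    if __name__ == '__main__':
        output_dir = Path('reads')                     # assumed download directory
        output_dir.mkdir(parents=True, exist_ok=True)

        # The index is passed as an open file object; csv.DictReader parses it.
        with open('index.tsv', newline='') as index:   # assumed index filename
            download_all_from_index(index, output_dir, threads=8)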