From b164ac63745e307bfef009400fc3191a27f7c96c Mon Sep 17 00:00:00 2001 From: flu0r1ne Date: Mon, 31 Oct 2022 05:42:30 -0500 Subject: restructure library --- main.c | 925 +++------------------------------------------------------------ makefile | 43 ++- qidx.c | 899 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ qidx.h | 70 +++++ 4 files changed, 1049 insertions(+), 888 deletions(-) create mode 100644 qidx.c create mode 100644 qidx.h diff --git a/main.c b/main.c index aa2f3d6..c56a951 100644 --- a/main.c +++ b/main.c @@ -1,735 +1,9 @@ #include #include -#include -#include -#include -#include -#include -#include - -#include "util/sds/sds.h" -#include "util/prime_search.h" -#include "util/hash.h" -#include "util/util.h" - -#include - -// @abstract Structure representing an unique alignment and virtual file pointer -// @field tid chromosome ID -// @field pos 0-based leftmost coordinate -// @field vptr virtual pointer -typedef struct { - uint32_t tid; - uint32_t pos; - uint64_t vptr; -} aln_spec_t; - - -#pragma pack(push, 1) - -typedef struct { - uint32_t tid; - uint32_t pos; - uint64_t vptr; -} aln_spec_packed_t; - -#pragma pack(pop) - -// constrains (qname < 2 ** 32) - -// @abstract Structure representing all alignments by queryname -// @field qname queryname (name for the given read) -// @field n_alns number of alignments -// @field alns array with alignments structures -typedef struct { - char * qname; - uint16_t n_alns; - aln_spec_t * alns; -} qidx_record_t; - -// @abstract Get the marshaled size of a record -static size_t qidx_record_msize(qidx_record_t const * q) { - return ( - strlen(q->qname) + 1 + - sizeof(uint16_t) + - sizeof(aln_spec_packed_t) * q->n_alns - ); -} - -// @abstract Martial a record to a fixed buffer -// -// @param q query index record -// @param dst destination buffer -// @param dstlen length of the destination buffer -// -// @return returns the number of bytes written to the buffer -// or < 0 if the destination buffer is exceeded -ssize_t qidx_record_marital(qidx_record_t const * q, void * dst, size_t dstlen) { - size_t rem = dstlen; - - size_t n = strlen(q->qname) + 1; - if(rem < n) - return -1; - rem -= n; - - { - char * _dst = dst; - strcpy(_dst, q->qname); - dst = (_dst + n); - } - - size_t sz = 0; - sz += sizeof(uint16_t); - sz += sizeof(aln_spec_packed_t) * q->n_alns; - - if(rem < sz) - return -1; - rem -= sz; - - { - uint16_t * _dst = dst; - *_dst = htole16(q->n_alns); - dst = _dst + 1; - } - - aln_spec_packed_t * buf_aln = dst; - for(size_t i = 0; i < q->n_alns; i++) { - aln_spec_t * aln = &q->alns[i]; - - buf_aln->tid = htole32(aln->tid); - buf_aln->pos = htole32(aln->pos); - buf_aln->vptr = htole64(aln->vptr); - - buf_aln++; - } - - return dstlen - rem; -} - -// @abstract Helper to parse the query name and the number of alignments -// @returns NULL if a parsing error occured else the location after the buffer -static void const * _parse_record_header(void const * buf, size_t buflen, - char const ** qname, uint16_t * n_alns) { - - size_t n = strnlen(buf, buflen) + 1; - *qname = buf; - - if(n + sizeof(uint16_t) > buflen) - return NULL; - - uint16_t const * _n_alns = (uint16_t const *) (*qname + n); - *n_alns = le64toh(*_n_alns); - buf = _n_alns + 1; - - if(sizeof(aln_spec_packed_t) * *n_alns > buflen) - return NULL; - - return buf; -} - -// @abstract Unmartial a query record. The resulting memory needs to be freed. -// All strings are sds strings. -// -// @param q query index record -// @param buf source buffer -// @param buflen source buffer length -// -// @return a pointer within buf after the record -// or NULL if a parsing error occurred -void const * qidx_record_unmartial(qidx_record_t * q, void const * buf, size_t buflen) { - char const * qname; - - if(!(buf = _parse_record_header(buf, buflen, &qname, &q->n_alns))) { - return NULL; - } - - q->qname = sdsnew(qname); - q->alns = calloc(q->n_alns, sizeof(aln_spec_t)); - - aln_spec_packed_t const * buf_aln = buf; - for(size_t i = 0; i < q->n_alns; i++) { - aln_spec_t * aln = &q->alns[i]; - - aln->pos = le32toh(buf_aln->pos); - aln->tid = le32toh(buf_aln->tid); - aln->vptr = le64toh(buf_aln->vptr); - - buf_aln++; - } - - return buf_aln; -} - -// @abstract Find a record efficiently in a marshaled buffer of query index records -// -// @param buf source buffer -// @param buflen length of the source buffer -// @param qname target queryname -// -// @returns a pointer to a target query or NULL if it a record with the target query name -// does not exist in the buffer -void const * qidx_record_martial_find(void const * buf, size_t buflen, char const * qname) { - size_t n = strnlen(buf, buflen); - - char const * _qname; - uint16_t n_alns; - - void const * cur = buf, - * next = NULL; - - while((next = _parse_record_header(cur, buflen, &_qname, &n_alns))) { - - if(strcmp(_qname, qname) == 0) { - return cur; - } - - next = (aln_spec_packed_t *) next + n_alns; - size_t l = (char const *) next - (char const *) cur; - buflen -= l; - cur = next; - - if(!buflen) { - return NULL; - } - } - - // Invalid - return NULL; -} - -typedef enum { - QIDX_OK = 0, - QIDX_ITER_DONE, - QIDX_NO_MEM, - QIDX_NO_BAM_HDR, - QIDX_BAM_READ_FAILURE, - QIDX_REC_ITER_FAILURE, - QIDX_NOT_SORTED, - QIDX_BUCKET_MAX_SIZE_EXCEEDED, - QIDX_MAP_FAIL, - QIDX_IO_FAIL, - QIDX_FAILED_TO_OPEN_BAMFILE, - QIDX_FAILED_TO_OPEN_INDEX_FILE, - QIDX_INVALID_VERSION, - QIDX_INVALID_MAGIC, -} qidx_err_t; - -bool qidx_errno(qidx_err_t err) { - switch(err) { - case QIDX_MAP_FAIL: - case QIDX_IO_FAIL: - return true; - default: - return false; - } - -} - -char const * qidx_strerr(qidx_err_t err) { - switch(err) { - case QIDX_NO_MEM: return "memory was exausted"; - case QIDX_NO_BAM_HDR: return "parsing BAM header failed"; - case QIDX_BAM_READ_FAILURE: return "failed to read bamfile"; - case QIDX_REC_ITER_FAILURE: return "bam query index record iterator failed"; - case QIDX_NOT_SORTED: return "input bam file not sorted by query name"; - case QIDX_BUCKET_MAX_SIZE_EXCEEDED: return "maximum query size exceeded"; - case QIDX_MAP_FAIL: return "mmap failed"; - case QIDX_IO_FAIL: return "qidx io failure"; - case QIDX_FAILED_TO_OPEN_BAMFILE: return "failed to open bamfile"; - case QIDX_FAILED_TO_OPEN_INDEX_FILE: return "failed to open indexfile"; - case QIDX_INVALID_MAGIC: return "invalid magic number (not a qidx or corrupted)"; - case QIDX_INVALID_VERSION: return "qidx version outdated, please regenerate"; - case QIDX_ITER_DONE: - case QIDX_OK: - return "OK"; - } -} - -qidx_err_t init_qidx(qidx_record_t * r) { - return QIDX_OK; -} - -void qidx_free(qidx_record_t * r) { - free(r); -} - -/* - * SAMTOOLS SORTING ALGORITHM - */ - -// strnum_cmp is the string comparator used by samtools sort -// to sort bam files by read names (e.g. QNAME) -// -// The input file is expected to be ordered accordingly. This -// is verified at runtime. -#define is_digit(c) ((c)<='9' && (c)>='0') -static int strnum_cmp(const char *_a, const char *_b) -{ - const unsigned char *a = (const unsigned char*)_a, *b = (const unsigned char*)_b; - const unsigned char *pa = a, *pb = b; - while (*pa && *pb) { - if (!is_digit(*pa) || !is_digit(*pb)) { - if (*pa != *pb) - return (int)*pa - (int)*pb; - ++pa; ++pb; - } else { - // skip leading zeros - while (*pa == '0') ++pa; - while (*pb == '0') ++pb; - - // skip matching digits - while (is_digit(*pa) && *pa == *pb) - pa++, pb++; - - // Now mismatching, so see which ends the number sooner - int diff = (int)*pa - (int)*pb; - while (is_digit(*pa) && is_digit(*pb)) - pa++, pb++; - - if (is_digit(*pa)) - return 1; // pa still going, so larger - else if (is_digit(*pb)) - return -1; // pb still going, so larger - else if (diff) - return diff; // same length, so earlier diff - } - } - return *pa? 1 : *pb? -1 : 0; -} - - -typedef struct { - htsFile *fp; - bam1_t * b; - bam_hdr_t * hdr; - - uint64_t file_offset; - sds queryname; - int ret; - - aln_spec_t * aln_specs; - uint16_t n_aln_cap; - uint16_t n_aln_sz; - - qidx_record_t * pending_release; -} qidx_record_iter_t; - -qidx_err_t qidx_record_iter_init(qidx_record_iter_t *it, htsFile *fp) { - it->b = bam_init1(); - it->fp = fp; - - // advance past header - it->hdr = sam_hdr_read(fp); - - if(!it->hdr) { - return QIDX_NO_BAM_HDR; - } - - it->file_offset = bgzf_tell(fp->fp.bgzf); - - if((it->ret = sam_read1(it->fp, it->hdr, it->b)) < 0) { - return QIDX_BAM_READ_FAILURE; - } - - it->queryname = sdsnew(bam_get_qname(it->b)); - it->aln_specs = NULL; - it->n_aln_sz = it->n_aln_cap = 0; - - if(bgzf_seek(fp->fp.bgzf, it->file_offset, SEEK_SET) < 0) { - return QIDX_IO_FAIL; - } - - it->pending_release = NULL; - - return QIDX_OK; -} - -static void _qidx_record_iter_free_record(qidx_record_iter_t * it) { - if(it->pending_release) { - free(it->pending_release->alns); - sdsfree(it->pending_release->qname); - it->pending_release = NULL; - } -} - -qidx_err_t qidx_record_iter_next(qidx_record_iter_t * it, qidx_record_t * rec) { - - _qidx_record_iter_free_record(it); - - sds queryname; - int cmp; - int ret; - bool yield = false; - while ((ret = sam_read1(it->fp, it->hdr, it->b)) >= 0) { - queryname = sdsnew(bam_get_qname(it->b)); - - if(!queryname) { - return QIDX_NO_MEM; - } - - if((cmp = strnum_cmp(it->queryname, queryname)) != 0) { - if(cmp > 0) { - printf("Err: the file is not sorted\n"); - return QIDX_NOT_SORTED; - } - - rec->n_alns = it->n_aln_sz; - rec->alns = it->aln_specs; - rec->qname = it->queryname; - - it->pending_release = rec; - - yield = true; - - it->queryname = sdsdup(queryname); - it->aln_specs = NULL; - it->n_aln_cap = it->n_aln_sz = 0; - } - - AM_RESIZE(it->aln_specs, it->n_aln_sz + 1, it->n_aln_cap); - - it->aln_specs[it->n_aln_sz].pos = it->b->core.pos; - it->aln_specs[it->n_aln_sz].tid = it->b->core.tid; - it->aln_specs[it->n_aln_sz].vptr = it->file_offset; - - it->n_aln_sz++; - - // update offset for next record - it->file_offset = bgzf_tell(it->fp->fp.bgzf); - - sdsfree(queryname); - - if(yield) { - break; - } - } - - if(ret < -1) { - it->pending_release = NULL; - return QIDX_BAM_READ_FAILURE; - } else if (ret == -1 && it->queryname != NULL) { - rec->n_alns = it->n_aln_sz; - rec->alns = it->aln_specs; - rec->qname = it->queryname; - - it->pending_release = rec; - - yield = true; - - it->queryname = NULL; - it->aln_specs = NULL; - it->n_aln_cap = it->n_aln_sz = 0; - } - - if(!yield) { - return QIDX_ITER_DONE; - } - - return QIDX_OK; -} - -qidx_err_t qidx_record_iter_destroy(qidx_record_iter_t * it) { - _qidx_record_iter_free_record(it); - - sdsfree(it->queryname); - - bam_hdr_destroy(it->hdr); - bam_destroy1(it->b); - - return QIDX_OK; -} - -#include - -// @abstract Structure containing summary statistics estimated using -// a one-pass algorithm -// @field mu mean -// @field s sample variance -// @field n number of entries -typedef struct { - double mu; - double s; - size_t n; -} roll_est_params_t; - -// @abstract observe a data point -void roll_est_add_evidence(roll_est_params_t * p, double x) { - double delta = x - p->mu; - p->mu += delta / (p->n + 1); - p->s += (x - p->mu) * delta; - p->n++; -} - -// @abstract finalize the estimate -void roll_est_finalize(roll_est_params_t * p) { - p->s /= p->n; -} - -// Calculates the maximum number of items placed into a single bucket -// to not exceed the bucket_size with a probability specified by the -// area to the right of the positive z-score z. E.g. at z=2 there is -// 2.4% chance of the soft limit being exceeded given objects are -// assigned uniformly to each bucket. (The cardinality of each bucket -// is the same.) -// -// In order for the maximum size to (really) not be exceeded, the -// binomial distribution of items in each bucket would need to be -// included. It is recommended to set this soft limit much lower -// than the hard limit to not exceed it. -// -// This is derived from the standard error as measured by the sampling -// distribution. - -// @param soft_limit a soft limit on the number of variable-length items -// stuffed into a single bucket -// @param z the z-score with which this will happen -// @param p the parameters of the normal distribution -size_t estimate_n_buckets(double soft_limit, double z, roll_est_params_t const * p) { - double n = sqrt(z * z * p->s + 4 * p->mu * soft_limit) - z * sqrt(p->s); - n = n / (2 * p->mu); - - int max_bucket_size = floor(n * n); - - return (size_t) ceil((double) p->n / (double) max_bucket_size); -} - -typedef struct { - uint64_t offset; - uint32_t n_bytes; -} qidx_bucket_t; - -typedef struct { - uint32_t n_buckets; - uint32_t max_bucket_size; - qidx_bucket_t * buckets; - char * rb_start; -} qidx_htable_t; - -#define QIDX_MAGIC 0x5d1de6b4 -#define QIDX_VERSION 1 - -#pragma pack(push, 1) - -typedef struct { - uint32_t magic; - uint16_t version; - uint32_t n_buckets; - uint32_t max_bucket_size; -} qidx_header_t; - -#pragma pack(pop) - -typedef struct { - int fd; - void * map; - size_t len; - bool read_only; - qidx_header_t * hdr; - qidx_bucket_t * buckets; - qidx_htable_t * htab; -} qidx_fp_t; - -size_t qidx_file_size(uint32_t n_buckets, uint32_t max_bucket_size) { - size_t sz = 0; - - sz += sizeof(qidx_header_t); - sz += sizeof(qidx_htable_t) * n_buckets; - sz += (uint64_t) n_buckets * (uint64_t) max_bucket_size; - - return sz; -} - -#include #include -// @abstract initialize htab, must first populate fp->buckets -qidx_err_t _qidx_init_htab(qidx_fp_t * fp, - uint32_t n_buckets, uint32_t max_bucket_size) { - qidx_htable_t * htab = malloc(sizeof(qidx_htable_t)); - if(!htab) { - return QIDX_NO_MEM; - } - - htab->buckets = calloc(sizeof(qidx_bucket_t), n_buckets); - if(!htab->buckets) { - free(htab); - return QIDX_NO_MEM; - } - - htab->n_buckets = n_buckets; - htab->max_bucket_size = max_bucket_size; - - htab->rb_start = (void *) (fp->buckets + n_buckets); - - fp->htab = htab; - - return QIDX_OK; -} - - -qidx_err_t qidx_open(qidx_fp_t * fp, int fd) { - - fp->fd = fd; - - qidx_header_t hdr; - if(read(fd, &hdr, sizeof(qidx_header_t)) == -1) { - return QIDX_FAILED_TO_OPEN_INDEX_FILE; - } - - if(le32toh(hdr.magic) != QIDX_MAGIC) { - return QIDX_INVALID_MAGIC; - } - - if(le16toh(hdr.version) != QIDX_VERSION) { - return QIDX_INVALID_VERSION; - } - - uint32_t max_bucket_size = le32toh(hdr.max_bucket_size); - uint32_t n_buckets = le32toh(hdr.n_buckets); - - size_t sz = qidx_file_size(n_buckets, max_bucket_size); - - if(lseek(fd, 0, SEEK_SET) == -1) { - return QIDX_IO_FAIL; - } - - fp->read_only = true; - fp->map = mmap(NULL, sz, PROT_READ, MAP_SHARED, fd, 0); - fp->len = sz; - - if(fp->map == MAP_FAILED) { - return QIDX_MAP_FAIL; - } - - fp->hdr = fp->map; - fp->buckets = (qidx_bucket_t *)(fp->hdr + 1); - - qidx_err_t err = _qidx_init_htab(fp, n_buckets, max_bucket_size); - if(err) { return err; } - - for(size_t i = 0; i < n_buckets; i++) { - fp->htab->buckets[i].n_bytes = le32toh(fp->buckets[i].n_bytes); - fp->htab->buckets[i].offset = le64toh(fp->buckets[i].offset); - } - - return QIDX_OK; -} - -qidx_err_t qidx_create(qidx_fp_t * fp, int fd, uint32_t _n_buckets, uint32_t max_bucket_size) { - - uint32_t n_buckets = next_prime_above_lowerbound(_n_buckets); - - size_t sz = qidx_file_size(n_buckets, max_bucket_size); - - fp->fd = fd; - - if(ftruncate(fd, sz) == -1) { - return QIDX_IO_FAIL; - } - - fp->read_only = false; - fp->map = mmap(NULL, sz, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - fp->len = sz; - - if(fp->map == MAP_FAILED) { - return QIDX_MAP_FAIL; - } - - fp->hdr = fp->map; - - fp->hdr->magic = htole32(QIDX_MAGIC); - fp->hdr->version = htole16(QIDX_VERSION); - fp->hdr->n_buckets = htole32(n_buckets); - fp->hdr->max_bucket_size = htole32(max_bucket_size); - - fp->buckets = (qidx_bucket_t *) (fp->hdr + 1); - - _qidx_init_htab(fp, n_buckets, max_bucket_size); - - uint64_t off = (char *) fp->htab->rb_start - (char *) fp->map; - for(size_t i = 0; i < n_buckets; i++) { - fp->htab->buckets[i].offset = off; - fp->htab->buckets[i].n_bytes = 0; - off += max_bucket_size; - } - - return QIDX_OK; -} - -qidx_err_t qidx_close(qidx_fp_t * fp) { - - qidx_htable_t * htab = fp->htab; - - if(!fp->read_only) { - for(size_t i = 0; i < htab->n_buckets; i++) { - fp->buckets[i].n_bytes = htole32(htab->buckets[i].n_bytes); - fp->buckets[i].offset = htole64(htab->buckets[i].offset); - } - } - - free(htab->buckets); - free(htab); - - if(munmap(fp->map, fp->len) != -1) { - return QIDX_MAP_FAIL; - } - - return QIDX_OK; -} - -qidx_htable_t * qidx_get_htable(qidx_fp_t * fp) { return fp->htab; } - -static qidx_bucket_t * _qidx_bucket(char const * qname, qidx_htable_t const * tab) { - size_t hash = murmur_hash_str_64s(qname, 0x12345); - - qidx_bucket_t * bucket = &tab->buckets[hash % tab->n_buckets]; - - return bucket; -} - -qidx_err_t qidx_table_add(qidx_htable_t * tab, qidx_record_t const * rec) { - qidx_bucket_t * bucket = _qidx_bucket(rec->qname, tab); - - void * ptr = (char *) tab->rb_start + bucket->offset + bucket->n_bytes; - size_t rem = tab->max_bucket_size - bucket->n_bytes; - - ssize_t wrlen = qidx_record_marital(rec, ptr, rem); - - if(wrlen < 0) { - return QIDX_BUCKET_MAX_SIZE_EXCEEDED; - } - - bucket->n_bytes += wrlen; - - return QIDX_OK; -} - -qidx_err_t qidx_table_get(qidx_htable_t const * tab, - char const * qname, qidx_record_t ** rec) { - qidx_bucket_t * bucket = _qidx_bucket(qname, tab); - - void const * bucket_start = tab->rb_start + bucket->offset; - void const * record_start = - qidx_record_martial_find(bucket_start, bucket->n_bytes, qname); - - if(!record_start) { - *rec = NULL; - } else { - size_t rem = bucket->n_bytes - - ((char const *) record_start - (char const *) bucket_start); - *rec = malloc(sizeof(qidx_record_t)); - - if(!*rec) { return QIDX_NO_MEM; } - - void const * next = qidx_record_unmartial(*rec, record_start, rem); - - if(!next) { - return QIDX_IO_FAIL; - } - } - - return QIDX_OK; -} +#include "util/util.h" +#include "qidx.h" void qidx_die_err(qidx_err_t err) { char const * errstr = qidx_strerr(err); @@ -740,160 +14,10 @@ void qidx_die_err(qidx_err_t err) { } } -void qidx_record_print(FILE * file, qidx_record_t * rec) { - fprintf(file, "qname: %s\n", rec->qname); - for(size_t i = 0; i < rec->n_alns; i++) { - fprintf(file, "\tTID: %u\n", rec->alns[i].tid); - fprintf(file, "\tPOS: %u\n", rec->alns[i].pos); - fprintf(file, "\tVPTR: %lu\n", rec->alns[i].vptr); - } -} - -qidx_err_t qidx_create_index(char const * bamfile, char const * indexfile) { - qidx_err_t err; - - htsFile *b_fp = hts_open(bamfile, "r"); - if(b_fp == NULL) { return QIDX_FAILED_TO_OPEN_BAMFILE; } - - // Pass 1: - // - estimate mu, s, n of the distribution of record sizes - // - estimate the number of buckets needed in the hash table - qidx_record_iter_t it; - if((err = qidx_record_iter_init(&it, b_fp))) { - return err; - } - - roll_est_params_t params; - memset(¶ms, 0, sizeof(roll_est_params_t)); - qidx_record_t rec; - while(!(err = qidx_record_iter_next(&it, &rec))) { - double sz = (double) qidx_record_msize(&rec); - roll_est_add_evidence(¶ms, sz); - } - if(err != QIDX_ITER_DONE) return err; - - if ((err = qidx_record_iter_destroy(&it))) { - return err; - } - - roll_est_finalize(¶ms); - - double soft_limit = 8 * 1024; - double z = 5; - uint32_t n_buckets = estimate_n_buckets(soft_limit, z, ¶ms); - uint32_t max_bucket_size = 1 << 24; - - printf("record size (mean): %f\n", params.mu); - printf("record size (variance): %f\n", params.s); - printf("number of records: %zu\n", params.n); - printf("number of buckets: %u\n", n_buckets); - - FILE * qidx = fopen(indexfile, "wb+"); - - if(!qidx) { return QIDX_FAILED_TO_OPEN_INDEX_FILE; } - - int fd = fileno(qidx); - - // PASS 2: - // - add records to index - - // Rewind to start of BAM file - if (-1 == bgzf_seek(b_fp->fp.bgzf, 0, SEEK_SET)) { - return QIDX_IO_FAIL; - } - - qidx_fp_t q_fp; - if((err = qidx_create(&q_fp, fd, n_buckets, max_bucket_size))) { - qidx_die_err(err); - } - - qidx_htable_t * htab = qidx_get_htable(&q_fp); - - if ((err = qidx_record_iter_init(&it, b_fp))) { - return err; - } - - while((err = qidx_record_iter_next(&it, &rec)) == QIDX_OK) { - qidx_table_add(htab, &rec); - } - if(err != QIDX_ITER_DONE) return err; - - if ((err = qidx_record_iter_destroy(&it))) { - return err; - } - - qidx_close(&q_fp); - fclose(qidx); - hts_close(b_fp); - - return QIDX_OK; -} - -qidx_err_t qidx_search_index(char const * indexfile, - char const * bamfile, char const * qname) { - FILE * index = fopen(indexfile, "rb"); - if(!index) { return QIDX_FAILED_TO_OPEN_INDEX_FILE; } - - int fd = fileno(index); - - qidx_fp_t q_fp; - qidx_err_t err = qidx_open(&q_fp, fd); - if(err) return err; - - qidx_htable_t * htab = qidx_get_htable(&q_fp); - - qidx_record_t * rec; - if((err = qidx_table_get(htab, qname, &rec))) { - return err; - } - - if(rec) { - htsFile * fp = hts_open(bamfile, "r"); - htsFile* out_fp = hts_open("-", "w"); - - - if(!fp) { - return QIDX_FAILED_TO_OPEN_BAMFILE; - } - - bam1_t * b = bam_init1(); - bam_hdr_t * hdr = sam_hdr_read(fp); - for(size_t i = 0; i < rec->n_alns; i++) { - aln_spec_t * aln = &rec->alns[i]; - - if(bgzf_seek(fp->fp.bgzf, aln->vptr, SEEK_SET) < 0) { - return QIDX_IO_FAIL; - } - - int ret; - if((ret = sam_read1(fp, hdr, b)) >= 0) { - - int ret = sam_write1(out_fp, hdr, b); - if(ret < 0) { - return QIDX_IO_FAIL; - } - } - } - - sdsfree(rec->qname); - free(rec->alns); - free(rec); - - bam_hdr_destroy(hdr); - bam_destroy1(b); - hts_close(out_fp); - hts_close(fp); - } else { - fprintf(stderr, "Could not find read: %s\n", qname); - } - - return QIDX_OK; -} - void die_usage(char const * prog) { err("%s {create,search}", prog); err(" create [bamfile] [indexfile]"); - die(" search [index] [read]"); + die(" search [index] [bamfile] [read]"); } int main(int argc, char ** argv) { @@ -904,23 +28,54 @@ int main(int argc, char ** argv) { char const * type = argv[1]; qidx_err_t err; + char const * prog = argv[0]; if(strcmp(type, "create") == 0) { - if(argc != 4) die_usage(argv[0]); - char const * bamfile = argv[2]; - char const * indexfile = argv[3]; - err = qidx_create_index(bamfile, indexfile); - if(err) { + int option; + uint32_t n_buckets = 0; + uint32_t max_bucket_size = 1 << 18; + + while((option = getopt(argc, argv, ":b:s:")) != -1) { + switch(option) { + case 'b': + if(1 != sscanf(optarg, "%u", &n_buckets)) { + die("could not parse -b"); + } + break; + case 's': + if(1 != sscanf(optarg, "%u", &max_bucket_size)) { + die("could not parse -s"); + } + break; + case ':': + die("option needs a value"); + case '?': + die("Unknown option %c", optopt); + break; + } + } + + if(argc - optind != 3) die_usage(prog); + + char const * bamfile = argv[optind + 1]; + char const * indexfile = argv[optind + 2]; + + if((err = qidx_create_index1(bamfile, + indexfile, n_buckets, max_bucket_size))) { qidx_die_err(err); } + } else if (strcmp(type, "search") == 0) { - if(argc != 5) die_usage(argv[0]); + if(argc != 5) die_usage(prog); + char const * indexfile = argv[2]; char const * bamfile = argv[3]; char const * qname = argv[4]; + if((err = qidx_search_index(indexfile, bamfile, qname))) { qidx_die_err(err); } + } else { die_usage(argv[0]); } diff --git a/makefile b/makefile index e412184..8328469 100644 --- a/makefile +++ b/makefile @@ -1,3 +1,40 @@ -main: - gcc main.c ./util/hash.c ./util/prime_search.c -lhts -lm ./util/sds/sds.c ./util/err.c -g -o qidx -fsanitize=address -.PHONY: main +ifeq ($(PREFIX),) + PREFIX := /usr/local +endif + +ifeq ($(BINDIR),) + BINDIR := /bin +endif + +CMD=qidx + +CC=gcc + +DEBUG := -fsanitize=address -O2 + +CFLAGS := -Wall -pedantic -g +LDFLAGS := -lhts -lm + +SRC_EXT:=%.c + +_STD_BUILD=$(CC) $(CFLAGS) $(filter $(SRC_EXT) %.o, $^) -o $@ +STD_BUILD=$(_STD_BUILD) $(LDFLAGS) +STD_COMPILE=$(_STD_BUILD) -c + +OBJS = ./util/hash.o +OBJS += ./util/prime_search.o +OBJS += ./util/prime_search.o +OBJS += ./util/sds/sds.o +OBJS += ./util/err.o +OBJS += ./qidx.o + +make: $(CMD) + +$(CMD): ./main.c $(OBJS) + $(STD_BUILD) + +clean: + $(RM) $(CMD) $(OBJS) + +%.o: %.c + $(STD_COMPILE) diff --git a/qidx.c b/qidx.c new file mode 100644 index 0000000..bcee165 --- /dev/null +++ b/qidx.c @@ -0,0 +1,899 @@ +#include "qidx.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "util/util.h" +#include "util/sds/sds.h" +#include "util/prime_search.h" +#include "util/hash.h" + +#pragma pack(push, 1) + +typedef struct { + uint32_t tid; + uint32_t pos; + uint64_t vptr; +} aln_spec_packed_t; + +#pragma pack(pop) + +// @abstract Get the marshaled size of a record +static size_t qidx_record_msize(qidx_record_t const * q) { + return ( + strlen(q->qname) + 1 + + sizeof(uint16_t) + + sizeof(aln_spec_packed_t) * q->n_alns + ); +} + +// @abstract Martial a record to a fixed buffer +// +// @param q query index record +// @param dst destination buffer +// @param dstlen length of the destination buffer +// +// @return returns the number of bytes written to the buffer +// or < 0 if the destination buffer is exceeded +ssize_t qidx_record_marital(qidx_record_t const * q, void * dst, size_t dstlen) { + size_t rem = dstlen; + + size_t n = strlen(q->qname) + 1; + if(rem < n) + return -1; + rem -= n; + + { + char * _dst = dst; + strcpy(_dst, q->qname); + dst = (_dst + n); + } + + size_t sz = 0; + sz += sizeof(uint16_t); + sz += sizeof(aln_spec_packed_t) * q->n_alns; + + if(rem < sz) + return -1; + rem -= sz; + + { + uint16_t * _dst = dst; + *_dst = htole16(q->n_alns); + dst = _dst + 1; + } + + aln_spec_packed_t * buf_aln = dst; + for(size_t i = 0; i < q->n_alns; i++) { + aln_spec_t * aln = &q->alns[i]; + + buf_aln->tid = htole32(aln->tid); + buf_aln->pos = htole32(aln->pos); + buf_aln->vptr = htole64(aln->vptr); + + buf_aln++; + } + + return dstlen - rem; +} + +// @abstract Helper to parse the query name and the number of alignments +// @returns NULL if a parsing error occured else the location after the buffer +static void const * _parse_record_header(void const * buf, size_t buflen, + char const ** qname, uint16_t * n_alns) { + + size_t n = strnlen(buf, buflen) + 1; + *qname = buf; + + if(n + sizeof(uint16_t) > buflen) + return NULL; + + uint16_t const * _n_alns = (uint16_t const *) (*qname + n); + *n_alns = le64toh(*_n_alns); + buf = _n_alns + 1; + + if(sizeof(aln_spec_packed_t) * *n_alns > buflen) + return NULL; + + return buf; +} + +// @abstract Unmartial a query record. The resulting memory needs to be freed. +// All strings are sds strings. +// +// @param q query index record +// @param buf source buffer +// @param buflen source buffer length +// +// @return a pointer within buf after the record +// or NULL if a parsing error occurred +void const * qidx_record_unmartial(qidx_record_t * q, void const * buf, size_t buflen) { + char const * qname; + + if(!(buf = _parse_record_header(buf, buflen, &qname, &q->n_alns))) { + return NULL; + } + + q->qname = sdsnew(qname); + q->alns = calloc(q->n_alns, sizeof(aln_spec_t)); + + aln_spec_packed_t const * buf_aln = buf; + for(size_t i = 0; i < q->n_alns; i++) { + aln_spec_t * aln = &q->alns[i]; + + aln->pos = le32toh(buf_aln->pos); + aln->tid = le32toh(buf_aln->tid); + aln->vptr = le64toh(buf_aln->vptr); + + buf_aln++; + } + + return buf_aln; +} + +// @abstract Find a record efficiently in a marshaled buffer of query index records +// +// @param buf source buffer +// @param buflen length of the source buffer +// @param qname target queryname +// +// @returns a pointer to a target query or NULL if it a record with the target query name +// does not exist in the buffer +void const * qidx_record_martial_find(void const * buf, size_t buflen, char const * qname) { + char const * _qname; + uint16_t n_alns; + + void const * cur = buf, + * next = NULL; + + while((next = _parse_record_header(cur, buflen, &_qname, &n_alns))) { + + if(strcmp(_qname, qname) == 0) { + return cur; + } + + next = (aln_spec_packed_t *) next + n_alns; + size_t l = (char const *) next - (char const *) cur; + buflen -= l; + cur = next; + + if(!buflen) { + return NULL; + } + } + + // Invalid + return NULL; +} + +bool qidx_errno(qidx_err_t err) { + switch(err) { + case QIDX_MAP_FAIL: + case QIDX_IO_FAIL: + return true; + default: + return false; + } +} + +char const * qidx_strerr(qidx_err_t err) { + switch(err) { + case QIDX_NO_MEM: return "memory was exausted"; + case QIDX_NO_BAM_HDR: return "parsing BAM header failed"; + case QIDX_BAM_READ_FAILURE: return "failed to read bamfile"; + case QIDX_REC_ITER_FAILURE: return "bam query index record iterator failed"; + case QIDX_NOT_SORTED: return "input bam file not sorted by query name"; + case QIDX_BUCKET_MAX_SIZE_EXCEEDED: return "maximum query size exceeded"; + case QIDX_MAP_FAIL: return "mmap failed"; + case QIDX_IO_FAIL: return "qidx io failure"; + case QIDX_FAILED_TO_OPEN_BAMFILE: return "failed to open bamfile"; + case QIDX_FAILED_TO_OPEN_INDEX_FILE: return "failed to open indexfile"; + case QIDX_INVALID_MAGIC: return "invalid magic number (not a qidx or corrupted)"; + case QIDX_INVALID_VERSION: return "qidx version outdated, please regenerate"; + case QIDX_ITER_DONE: + case QIDX_OK: + return "OK"; + default: + return "Unknown"; + } + +} + +/* + * SAMTOOLS SORTING ALGORITHM + */ + +// strnum_cmp is the string comparator used by samtools sort +// to sort bam files by read names (e.g. QNAME) +// +// The input file is expected to be ordered accordingly. This +// is verified at runtime. +#define is_digit(c) ((c)<='9' && (c)>='0') +static int strnum_cmp(const char *_a, const char *_b) +{ + const unsigned char *a = (const unsigned char*)_a, *b = (const unsigned char*)_b; + const unsigned char *pa = a, *pb = b; + while (*pa && *pb) { + if (!is_digit(*pa) || !is_digit(*pb)) { + if (*pa != *pb) + return (int)*pa - (int)*pb; + ++pa; ++pb; + } else { + // skip leading zeros + while (*pa == '0') ++pa; + while (*pb == '0') ++pb; + + // skip matching digits + while (is_digit(*pa) && *pa == *pb) + pa++, pb++; + + // Now mismatching, so see which ends the number sooner + int diff = (int)*pa - (int)*pb; + while (is_digit(*pa) && is_digit(*pb)) + pa++, pb++; + + if (is_digit(*pa)) + return 1; // pa still going, so larger + else if (is_digit(*pb)) + return -1; // pb still going, so larger + else if (diff) + return diff; // same length, so earlier diff + } + } + return *pa? 1 : *pb? -1 : 0; +} + +typedef struct { + htsFile *fp; + bam1_t * b; + bam_hdr_t * hdr; + + uint64_t file_offset; + sds queryname; + int ret; + + aln_spec_t * aln_specs; + uint16_t n_aln_cap; + uint16_t n_aln_sz; + + qidx_record_t * pending_release; +} qidx_record_iter_t; + +qidx_err_t qidx_record_iter_init(qidx_record_iter_t *it, htsFile *fp) { + it->b = bam_init1(); + it->fp = fp; + + // advance past header + it->hdr = sam_hdr_read(fp); + + if(!it->hdr) { + return QIDX_NO_BAM_HDR; + } + + it->file_offset = bgzf_tell(fp->fp.bgzf); + + if((it->ret = sam_read1(it->fp, it->hdr, it->b)) < 0) { + return QIDX_BAM_READ_FAILURE; + } + + it->queryname = sdsnew(bam_get_qname(it->b)); + it->aln_specs = NULL; + it->n_aln_sz = it->n_aln_cap = 0; + + if(bgzf_seek(fp->fp.bgzf, it->file_offset, SEEK_SET) < 0) { + return QIDX_IO_FAIL; + } + + it->pending_release = NULL; + + return QIDX_OK; +} + +static void _qidx_record_iter_free_record(qidx_record_iter_t * it) { + if(it->pending_release) { + free(it->pending_release->alns); + sdsfree(it->pending_release->qname); + it->pending_release = NULL; + } +} + +qidx_err_t qidx_record_iter_next(qidx_record_iter_t * it, qidx_record_t * rec) { + + _qidx_record_iter_free_record(it); + + sds queryname; + int cmp; + int ret; + bool yield = false; + while ((ret = sam_read1(it->fp, it->hdr, it->b)) >= 0) { + queryname = sdsnew(bam_get_qname(it->b)); + + if(!queryname) { + return QIDX_NO_MEM; + } + + if((cmp = strnum_cmp(it->queryname, queryname)) != 0) { + if(cmp > 0) { + printf("Err: the file is not sorted\n"); + return QIDX_NOT_SORTED; + } + + rec->n_alns = it->n_aln_sz; + rec->alns = it->aln_specs; + rec->qname = it->queryname; + + it->pending_release = rec; + + yield = true; + + it->queryname = sdsdup(queryname); + it->aln_specs = NULL; + it->n_aln_cap = it->n_aln_sz = 0; + } + + AM_RESIZE(it->aln_specs, it->n_aln_sz + 1, it->n_aln_cap); + + it->aln_specs[it->n_aln_sz].pos = it->b->core.pos; + it->aln_specs[it->n_aln_sz].tid = it->b->core.tid; + it->aln_specs[it->n_aln_sz].vptr = it->file_offset; + + it->n_aln_sz++; + + // update offset for next record + it->file_offset = bgzf_tell(it->fp->fp.bgzf); + + sdsfree(queryname); + + if(yield) { + break; + } + } + + if(ret < -1) { + it->pending_release = NULL; + return QIDX_BAM_READ_FAILURE; + } else if (ret == -1 && it->queryname != NULL) { + rec->n_alns = it->n_aln_sz; + rec->alns = it->aln_specs; + rec->qname = it->queryname; + + it->pending_release = rec; + + yield = true; + + it->queryname = NULL; + it->aln_specs = NULL; + it->n_aln_cap = it->n_aln_sz = 0; + } + + if(!yield) { + return QIDX_ITER_DONE; + } + + return QIDX_OK; +} + +qidx_err_t qidx_record_iter_destroy(qidx_record_iter_t * it) { + _qidx_record_iter_free_record(it); + + sdsfree(it->queryname); + + bam_hdr_destroy(it->hdr); + bam_destroy1(it->b); + + return QIDX_OK; +} + +// @abstract Structure containing summary statistics estimated using +// a one-pass algorithm +// @field mu mean +// @field s sample variance +// @field n number of entries +typedef struct { + double mu; + double s; + size_t n; +} roll_est_params_t; + +// @abstract observe a data point +void roll_est_add_evidence(roll_est_params_t * p, double x) { + double delta = x - p->mu; + p->mu += delta / (p->n + 1); + p->s += (x - p->mu) * delta; + p->n++; +} + +// @abstract finalize the estimate +void roll_est_finalize(roll_est_params_t * p) { + p->s /= p->n; +} + +// Calculates the maximum number of items placed into a single bucket +// to not exceed the bucket_size with a probability specified by the +// area to the right of the positive z-score z. E.g. at z=2 there is +// 2.4% chance of the soft limit being exceeded given objects are +// assigned uniformly to each bucket. (The cardinality of each bucket +// is the same.) +// +// In order for the maximum size to (really) not be exceeded, the +// binomial distribution of items in each bucket would need to be +// included. It is recommended to set this soft limit much lower +// than the hard limit to not exceed it. +// +// This is derived from the standard error as measured by the sampling +// distribution. + +// @param soft_limit a soft limit on the number of variable-length items +// stuffed into a single bucket +// @param z the z-score with which this will happen +// @param p the parameters of the normal distribution +size_t estimate_n_buckets(double soft_limit, double z, roll_est_params_t const * p) { + double n = sqrt(z * z * p->s + 4 * p->mu * soft_limit) - z * sqrt(p->s); + n = n / (2 * p->mu); + + int max_bucket_size = floor(n * n); + + return (size_t) ceil((double) p->n / (double) max_bucket_size); +} + +typedef struct { + uint64_t offset; + uint32_t n_bytes; +} qidx_bucket_t; + +#pragma pack(push, 1) + +typedef struct { + uint64_t offset; + uint32_t n_bytes; +} qidx_bucket_packed_t; + +#pragma pack(pop) + + +typedef struct { + uint32_t n_buckets; + uint32_t max_bucket_size; + qidx_bucket_t * buckets; + char * rb_start; +} qidx_htable_t; + +#pragma pack(push, 1) + +typedef struct { + uint32_t magic; + uint16_t version; + uint32_t n_buckets; + uint32_t max_bucket_size; +} qidx_header_t; + +#pragma pack(pop) + +struct qidx_fp { + int fd; + void * map; + size_t len; + bool read_only; + qidx_header_t * hdr; + qidx_bucket_packed_t * buckets; + qidx_htable_t * htab; +}; + +size_t qidx_file_size(uint32_t n_buckets, uint32_t max_bucket_size) { + size_t sz = 0; + + sz += sizeof(qidx_header_t); + sz += (uint64_t) sizeof(qidx_bucket_packed_t) * (uint64_t) n_buckets; + sz += (uint64_t) n_buckets * (uint64_t) max_bucket_size; + + return sz; +} + +// @abstract initialize htab, must first populate fp->buckets +qidx_err_t _qidx_init_htab(qidx_fp_t * fp, + uint32_t n_buckets, uint32_t max_bucket_size) { + qidx_htable_t * htab = malloc(sizeof(qidx_htable_t)); + if(!htab) { + return QIDX_NO_MEM; + } + + htab->buckets = calloc(sizeof(qidx_bucket_t), n_buckets); + if(!htab->buckets) { + free(htab); + return QIDX_NO_MEM; + } + + htab->n_buckets = n_buckets; + htab->max_bucket_size = max_bucket_size; + + htab->rb_start = (void *) (fp->buckets + n_buckets); + + fp->htab = htab; + + return QIDX_OK; +} + + +qidx_err_t qidx_open(qidx_fp_t ** _fp, int fd) { + qidx_fp_t * fp = malloc(sizeof(qidx_fp_t)); + *_fp = fp; + + fp->fd = fd; + + qidx_header_t hdr; + if(read(fd, &hdr, sizeof(qidx_header_t)) == -1) { + return QIDX_FAILED_TO_OPEN_INDEX_FILE; + } + + if(le32toh(hdr.magic) != QIDX_MAGIC) { + return QIDX_INVALID_MAGIC; + } + + if(le16toh(hdr.version) != QIDX_VERSION) { + return QIDX_INVALID_VERSION; + } + + uint32_t max_bucket_size = le32toh(hdr.max_bucket_size); + uint32_t n_buckets = le32toh(hdr.n_buckets); + + size_t sz = qidx_file_size(n_buckets, max_bucket_size); + + if(lseek(fd, 0, SEEK_SET) == -1) { + return QIDX_IO_FAIL; + } + + fp->read_only = true; + fp->map = mmap(NULL, sz, PROT_READ, MAP_SHARED, fd, 0); + fp->len = sz; + + if(fp->map == MAP_FAILED) { + return QIDX_MAP_FAIL; + } + + fp->hdr = fp->map; + fp->buckets = (qidx_bucket_packed_t *)(fp->hdr + 1); + + qidx_err_t err = _qidx_init_htab(fp, n_buckets, max_bucket_size); + if(err) { return err; } + + for(size_t i = 0; i < n_buckets; i++) { + fp->htab->buckets[i].n_bytes = le32toh(fp->buckets[i].n_bytes); + fp->htab->buckets[i].offset = le64toh(fp->buckets[i].offset); + } + + return QIDX_OK; +} + +qidx_err_t qidx_create(qidx_fp_t ** _fp, int fd, uint32_t _n_buckets, uint32_t max_bucket_size) { + qidx_fp_t * fp = malloc(sizeof(qidx_fp_t)); + *_fp = fp; + + uint32_t n_buckets = next_prime_above_lowerbound(_n_buckets); + + size_t sz = qidx_file_size(n_buckets, max_bucket_size); + + fp->fd = fd; + + if(ftruncate(fd, sz) == -1) { + return QIDX_IO_FAIL; + } + + fp->read_only = false; + fp->map = mmap(NULL, sz, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + fp->len = sz; + + if(fp->map == MAP_FAILED) { + return QIDX_MAP_FAIL; + } + + fp->hdr = fp->map; + + fp->hdr->magic = htole32(QIDX_MAGIC); + fp->hdr->version = htole16(QIDX_VERSION); + fp->hdr->n_buckets = htole32(n_buckets); + fp->hdr->max_bucket_size = htole32(max_bucket_size); + + fp->buckets = (qidx_bucket_packed_t *) (fp->hdr + 1); + + _qidx_init_htab(fp, n_buckets, max_bucket_size); + + uint64_t off = 0; + for(size_t i = 0; i < n_buckets; i++) { + fp->htab->buckets[i].offset = off; + fp->htab->buckets[i].n_bytes = 0; + + off += max_bucket_size; + } + + // REVISIT + for(size_t i = n_buckets - 1; i > 0; i--) { + fp->htab->rb_start[fp->htab->buckets[i].offset] = '\1'; + } + + return QIDX_OK; +} + +qidx_err_t qidx_close(qidx_fp_t * fp) { + + qidx_htable_t * htab = fp->htab; + + if(!fp->read_only) { + for(size_t i = 0; i < htab->n_buckets; i++) { + fp->buckets[i].n_bytes = htole32(htab->buckets[i].n_bytes); + fp->buckets[i].offset = htole64(htab->buckets[i].offset); + } + } + + free(htab->buckets); + free(htab); + + if(munmap(fp->map, fp->len) != -1) { + return QIDX_MAP_FAIL; + } + + free(fp); + + return QIDX_OK; +} + +qidx_htable_t * qidx_get_htable(qidx_fp_t * fp) { return fp->htab; } + +static qidx_bucket_t * _qidx_bucket(char const * qname, qidx_htable_t const * tab) { + size_t hash = murmur_hash_str_64s(qname, 0x12345); + + qidx_bucket_t * bucket = &tab->buckets[hash % (size_t)(tab->n_buckets)]; + + return bucket; +} + +qidx_err_t qidx_table_add(qidx_htable_t * tab, qidx_record_t const * rec) { + qidx_bucket_t * bucket = _qidx_bucket(rec->qname, tab); + + void * ptr = (char *) tab->rb_start + bucket->offset + bucket->n_bytes; + size_t rem = tab->max_bucket_size - bucket->n_bytes; + + ssize_t wrlen = qidx_record_marital(rec, ptr, rem); + + if(wrlen < 0) { + return QIDX_BUCKET_MAX_SIZE_EXCEEDED; + } + + bucket->n_bytes += wrlen; + + return QIDX_OK; +} + +qidx_err_t qidx_table_get(qidx_htable_t const * tab, + char const * qname, qidx_record_t ** rec) { + qidx_bucket_t * bucket = _qidx_bucket(qname, tab); + + void const * bucket_start = tab->rb_start + bucket->offset; + void const * record_start = + qidx_record_martial_find(bucket_start, bucket->n_bytes, qname); + + if(!record_start) { + *rec = NULL; + } else { + size_t rem = bucket->n_bytes - + ((char const *) record_start - (char const *) bucket_start); + *rec = malloc(sizeof(qidx_record_t)); + + if(!*rec) { return QIDX_NO_MEM; } + + void const * next = qidx_record_unmartial(*rec, record_start, rem); + + if(!next) { + return QIDX_IO_FAIL; + } + } + + return QIDX_OK; +} + +void qidx_record_print(FILE * file, qidx_record_t * rec) { + fprintf(file, "qname: %s\n", rec->qname); + for(size_t i = 0; i < rec->n_alns; i++) { + fprintf(file, "\tTID: %u\n", rec->alns[i].tid); + fprintf(file, "\tPOS: %u\n", rec->alns[i].pos); + fprintf(file, "\tVPTR: %lu\n", rec->alns[i].vptr); + } +} + +qidx_err_t qidx_estimate_buckets(htsFile * fp, + double soft_limit, double z, uint32_t * n_buckets) { + + // Pass 1: + // - estimate mu, s, n of the distribution of record sizes + // - estimate the number of buckets needed in the hash table + qidx_record_iter_t it; + qidx_err_t err; + if((err = qidx_record_iter_init(&it, fp))) { + return err; + } + + roll_est_params_t params; + memset(¶ms, 0, sizeof(roll_est_params_t)); + qidx_record_t rec; + while(!(err = qidx_record_iter_next(&it, &rec))) { + double sz = (double) qidx_record_msize(&rec); + roll_est_add_evidence(¶ms, sz); + } + if(err != QIDX_ITER_DONE) return err; + + if ((err = qidx_record_iter_destroy(&it))) { + return err; + } + + roll_est_finalize(¶ms); + + *n_buckets = estimate_n_buckets(soft_limit, z, ¶ms); + + printf("record size (mean): %f\n", params.mu); + printf("record size (variance): %f\n", params.s); + printf("number of records: %zu\n", params.n); + printf("number of buckets: %u\n", *n_buckets); + + return QIDX_OK; +} + +qidx_err_t qidx_create_index3(htsFile * fp, int fd, + qidx_fp_t ** _q_fp, uint32_t n_buckets, + uint32_t max_bucket_size) { + qidx_err_t err; + + if((err = qidx_create(_q_fp, fd, n_buckets, max_bucket_size))) { + return err; + } + + qidx_fp_t * q_fp = *_q_fp; + + qidx_htable_t * htab = qidx_get_htable(q_fp); + + qidx_record_iter_t it; + if ((err = qidx_record_iter_init(&it, fp))) { + return err; + } + + qidx_record_t rec; + while((err = qidx_record_iter_next(&it, &rec)) == QIDX_OK) { + qidx_table_add(htab, &rec); + } + if(err != QIDX_ITER_DONE) return err; + + if ((err = qidx_record_iter_destroy(&it))) { + return err; + } + + return QIDX_OK; +} + +qidx_err_t qidx_create_index2(htsFile * fp, int fd, qidx_fp_t ** q_fp, + double soft_limit, double z, uint32_t n_buckets, + uint32_t max_bucket_size) { + qidx_err_t err; + + if(n_buckets == 0) { + + err = qidx_estimate_buckets(fp, soft_limit, z, &n_buckets); + + if(err) { + return err; + } + + // Rewind to start of BAM file + if (-1 == bgzf_seek(fp->fp.bgzf, 0, SEEK_SET)) { + return QIDX_IO_FAIL; + } + + } + + if((err = qidx_create_index3(fp, fd, q_fp, n_buckets, max_bucket_size))) { + return err; + } + + return QIDX_OK; +} + +qidx_err_t qidx_create_index1(char const * bamfile, char const * indexfile, + uint32_t n_buckets, uint32_t max_bucket_size) { + htsFile *b_fp = hts_open(bamfile, "r"); + if(b_fp == NULL) { return QIDX_FAILED_TO_OPEN_BAMFILE; } + + FILE * qidx_file = fopen(indexfile, "wb+"); + if(!qidx_file) { return QIDX_FAILED_TO_OPEN_INDEX_FILE; } + int fd = fileno(qidx_file); + + double soft_limit = 16 * 1024; + double z = 5; + + qidx_fp_t * q_fp; + qidx_err_t err; + if((err = qidx_create_index2(b_fp, fd, &q_fp, + soft_limit, z, n_buckets, max_bucket_size))) { + return err; + } + + qidx_close(q_fp); + fclose(qidx_file); + hts_close(b_fp); + + return QIDX_OK; +} + +qidx_err_t qidx_create_index(char const * bamfile, char const * indexfile) { + uint32_t max_bucket_size = 1 << 19; + return qidx_create_index1(bamfile, indexfile, 0, max_bucket_size); +} + +qidx_err_t qidx_search_index(char const * indexfile, + char const * bamfile, char const * qname) { + FILE * index = fopen(indexfile, "rb"); + if(!index) { return QIDX_FAILED_TO_OPEN_INDEX_FILE; } + + int fd = fileno(index); + + qidx_fp_t * q_fp; + qidx_err_t err = qidx_open(&q_fp, fd); + if(err) return err; + + qidx_htable_t * htab = qidx_get_htable(q_fp); + + qidx_record_t * rec; + if((err = qidx_table_get(htab, qname, &rec))) { + return err; + } + + if(rec) { + htsFile * fp = hts_open(bamfile, "r"); + htsFile* out_fp = hts_open("-", "w"); + + + if(!fp) { + return QIDX_FAILED_TO_OPEN_BAMFILE; + } + + bam1_t * b = bam_init1(); + bam_hdr_t * hdr = sam_hdr_read(fp); + for(size_t i = 0; i < rec->n_alns; i++) { + aln_spec_t * aln = &rec->alns[i]; + + if(bgzf_seek(fp->fp.bgzf, aln->vptr, SEEK_SET) < 0) { + return QIDX_IO_FAIL; + } + + int ret; + if((ret = sam_read1(fp, hdr, b)) >= 0) { + + int ret = sam_write1(out_fp, hdr, b); + if(ret < 0) { + return QIDX_IO_FAIL; + } + } + } + + sdsfree(rec->qname); + free(rec->alns); + free(rec); + + bam_hdr_destroy(hdr); + bam_destroy1(b); + hts_close(out_fp); + hts_close(fp); + } else { + fprintf(stderr, "Could not find read: %s\n", qname); + } + + if((err = qidx_close(q_fp))) { + return err; + } + + return QIDX_OK; +} + diff --git a/qidx.h b/qidx.h new file mode 100644 index 0000000..fdc1046 --- /dev/null +++ b/qidx.h @@ -0,0 +1,70 @@ +#ifndef _QIDX +#define _QIDX + +#include +#include + +// @abstract Structure representing an unique alignment and virtual file pointer +// @field tid chromosome ID +// @field pos 0-based leftmost coordinate +// @field vptr virtual pointer +typedef struct { + uint32_t tid; + uint32_t pos; + uint64_t vptr; +} aln_spec_t; + +// @abstract Structure representing all alignments by queryname +// @field qname queryname (name for the given read) +// @field n_alns number of alignments +// @field alns array with alignments structures +typedef struct { + char * qname; + uint16_t n_alns; + aln_spec_t * alns; +} qidx_record_t; + +typedef enum { + QIDX_OK = 0, + QIDX_ITER_DONE, + QIDX_NO_MEM, + QIDX_NO_BAM_HDR, + QIDX_BAM_READ_FAILURE, + QIDX_REC_ITER_FAILURE, + QIDX_NOT_SORTED, + QIDX_BUCKET_MAX_SIZE_EXCEEDED, + QIDX_MAP_FAIL, + QIDX_IO_FAIL, + QIDX_FAILED_TO_OPEN_BAMFILE, + QIDX_FAILED_TO_OPEN_INDEX_FILE, + QIDX_INVALID_VERSION, + QIDX_INVALID_MAGIC, +} qidx_err_t; + +bool qidx_errno(qidx_err_t err); + +char const * qidx_strerr(qidx_err_t err); + +#define QIDX_MAGIC 0x5d1de6b4 +#define QIDX_VERSION 1 + +struct qidx_fp; +typedef struct qidx_fp qidx_fp_t; + +qidx_err_t qidx_open(qidx_fp_t ** fp, int fd); +qidx_err_t qidx_create(qidx_fp_t ** fp, int fd, uint32_t _n_buckets, + uint32_t max_bucket_size); +qidx_err_t qidx_close(qidx_fp_t * fp); + +qidx_err_t qidx_create_index3(htsFile * fp, int fd, qidx_fp_t ** q_fp, + uint32_t n_buckets, uint32_t max_bucket_size); +qidx_err_t qidx_create_index2(htsFile * fp, int fd, qidx_fp_t ** q_fp, + double soft_limit, double z, uint32_t n_buckets, uint32_t max_bucket_size); +qidx_err_t qidx_create_index1(char const * bamfile, char const * indexfile, uint32_t n_buckets, uint32_t max_bucket_size); +qidx_err_t qidx_create_index(char const * bamfile, char const * indexfile); + +// DO NOT USE +qidx_err_t qidx_search_index(char const * indexfile, + char const * bamfile, char const * qname); + +#endif /* ifndef _QIDX */ -- cgit v1.2.3