#include #include #include #include #include #include #include #include #include "util/sds/sds.h" #include "util/prime_search.h" #include "util/hash.h" #include "util/util.h" #include // @abstract Structure representing an unique alignment and virtual file pointer // @field tid chromosome ID // @field pos 0-based leftmost coordinate // @field vptr virtual pointer typedef struct { uint32_t tid; uint32_t pos; uint64_t vptr; } aln_spec_t; #pragma pack(push, 1) typedef struct { uint32_t tid; uint32_t pos; uint64_t vptr; } aln_spec_packed_t; #pragma pack(pop) // constrains (qname < 2 ** 32) // @abstract Structure representing all alignments by queryname // @field qname queryname (name for the given read) // @field n_alns number of alignments // @field alns array with alignments structures typedef struct { char * qname; uint16_t n_alns; aln_spec_t * alns; } qidx_record_t; // @abstract Get the marshaled size of a record static size_t qidx_record_msize(qidx_record_t const * q) { return ( strlen(q->qname) + 1 + sizeof(uint16_t) + sizeof(aln_spec_packed_t) * q->n_alns ); } // @abstract Martial a record to a fixed buffer // // @param q query index record // @param dst destination buffer // @param dstlen length of the destination buffer // // @return returns the number of bytes written to the buffer // or < 0 if the destination buffer is exceeded ssize_t qidx_record_marital(qidx_record_t const * q, void * dst, size_t dstlen) { size_t rem = dstlen; size_t n = strlen(q->qname) + 1; if(rem < n) return -1; rem -= n; { char * _dst = dst; strcpy(_dst, q->qname); dst = (_dst + n); } size_t sz = 0; sz += sizeof(uint16_t); sz += sizeof(aln_spec_packed_t) * q->n_alns; if(rem < sz) return -1; rem -= sz; { uint16_t * _dst = dst; *_dst = htole16(q->n_alns); dst = _dst + 1; } aln_spec_packed_t * buf_aln = dst; for(size_t i = 0; i < q->n_alns; i++) { aln_spec_t * aln = &q->alns[i]; buf_aln->tid = htole32(aln->tid); buf_aln->pos = htole32(aln->pos); buf_aln->vptr = htole64(aln->vptr); buf_aln++; } return dstlen - rem; } // @abstract Helper to parse the query name and the number of alignments // @returns NULL if a parsing error occured else the location after the buffer static void const * _parse_record_header(void const * buf, size_t buflen, char const ** qname, uint16_t * n_alns) { size_t n = strnlen(buf, buflen) + 1; *qname = buf; if(n + sizeof(uint16_t) > buflen) return NULL; uint16_t const * _n_alns = (uint16_t const *) (*qname + n); *n_alns = le64toh(*_n_alns); buf = _n_alns + 1; if(sizeof(aln_spec_packed_t) * *n_alns > buflen) return NULL; return buf; } // @abstract Unmartial a query record. The resulting memory needs to be freed. // All strings are sds strings. // // @param q query index record // @param buf source buffer // @param buflen source buffer length // // @return a pointer within buf after the record // or NULL if a parsing error occurred void const * qidx_record_unmartial(qidx_record_t * q, void const * buf, size_t buflen) { char const * qname; if(!(buf = _parse_record_header(buf, buflen, &qname, &q->n_alns))) { return NULL; } q->qname = sdsnew(qname); q->alns = calloc(q->n_alns, sizeof(aln_spec_t)); aln_spec_packed_t const * buf_aln = buf; for(size_t i = 0; i < q->n_alns; i++) { aln_spec_t * aln = &q->alns[i]; aln->pos = le32toh(buf_aln->pos); aln->tid = le32toh(buf_aln->tid); aln->vptr = le64toh(buf_aln->vptr); buf_aln++; } return buf_aln; } // @abstract Find a record efficiently in a marshaled buffer of query index records // // @param buf source buffer // @param buflen length of the source buffer // @param qname target queryname // // @returns a pointer to a target query or NULL if it a record with the target query name // does not exist in the buffer void const * qidx_record_martial_find(void const * buf, size_t buflen, char const * qname) { size_t n = strnlen(buf, buflen); char const * _qname; uint16_t n_alns; void const * cur = buf, * next = NULL; while((next = _parse_record_header(cur, buflen, &_qname, &n_alns))) { if(strcmp(_qname, qname) == 0) { return cur; } next = (aln_spec_packed_t *) next + n_alns; size_t l = (char const *) next - (char const *) cur; buflen -= l; cur = next; if(!buflen) { return NULL; } } // Invalid return NULL; } typedef enum { QIDX_OK = 0, QIDX_ITER_DONE, QIDX_NO_MEM, QIDX_NO_BAM_HDR, QIDX_BAM_READ_FAILURE, QIDX_REC_ITER_FAILURE, QIDX_NOT_SORTED, QIDX_BUCKET_MAX_SIZE_EXCEEDED, QIDX_MAP_FAIL, QIDX_IO_FAIL, QIDX_FAILED_TO_OPEN_BAMFILE, QIDX_FAILED_TO_OPEN_INDEX_FILE, QIDX_INVALID_VERSION, QIDX_INVALID_MAGIC, } qidx_err_t; bool qidx_errno(qidx_err_t err) { switch(err) { case QIDX_MAP_FAIL: case QIDX_IO_FAIL: return true; default: return false; } } char const * qidx_strerr(qidx_err_t err) { switch(err) { case QIDX_NO_MEM: return "memory was exausted"; case QIDX_NO_BAM_HDR: return "parsing BAM header failed"; case QIDX_BAM_READ_FAILURE: return "failed to read bamfile"; case QIDX_REC_ITER_FAILURE: return "bam query index record iterator failed"; case QIDX_NOT_SORTED: return "input bam file not sorted by query name"; case QIDX_BUCKET_MAX_SIZE_EXCEEDED: return "maximum query size exceeded"; case QIDX_MAP_FAIL: return "mmap failed"; case QIDX_IO_FAIL: return "qidx io failure"; case QIDX_FAILED_TO_OPEN_BAMFILE: return "failed to open bamfile"; case QIDX_FAILED_TO_OPEN_INDEX_FILE: return "failed to open indexfile"; case QIDX_INVALID_MAGIC: return "invalid magic number (not a qidx or corrupted)"; case QIDX_INVALID_VERSION: return "qidx version outdated, please regenerate"; case QIDX_ITER_DONE: case QIDX_OK: return "OK"; } } qidx_err_t init_qidx(qidx_record_t * r) { return QIDX_OK; } void qidx_free(qidx_record_t * r) { free(r); } /* * SAMTOOLS SORTING ALGORITHM */ // strnum_cmp is the string comparator used by samtools sort // to sort bam files by read names (e.g. QNAME) // // The input file is expected to be ordered accordingly. This // is verified at runtime. #define is_digit(c) ((c)<='9' && (c)>='0') static int strnum_cmp(const char *_a, const char *_b) { const unsigned char *a = (const unsigned char*)_a, *b = (const unsigned char*)_b; const unsigned char *pa = a, *pb = b; while (*pa && *pb) { if (!is_digit(*pa) || !is_digit(*pb)) { if (*pa != *pb) return (int)*pa - (int)*pb; ++pa; ++pb; } else { // skip leading zeros while (*pa == '0') ++pa; while (*pb == '0') ++pb; // skip matching digits while (is_digit(*pa) && *pa == *pb) pa++, pb++; // Now mismatching, so see which ends the number sooner int diff = (int)*pa - (int)*pb; while (is_digit(*pa) && is_digit(*pb)) pa++, pb++; if (is_digit(*pa)) return 1; // pa still going, so larger else if (is_digit(*pb)) return -1; // pb still going, so larger else if (diff) return diff; // same length, so earlier diff } } return *pa? 1 : *pb? -1 : 0; } typedef struct { htsFile *fp; bam1_t * b; bam_hdr_t * hdr; uint64_t file_offset; sds queryname; int ret; aln_spec_t * aln_specs; uint16_t n_aln_cap; uint16_t n_aln_sz; qidx_record_t * pending_release; } qidx_record_iter_t; qidx_err_t qidx_record_iter_init(qidx_record_iter_t *it, htsFile *fp) { it->b = bam_init1(); it->fp = fp; // advance past header it->hdr = sam_hdr_read(fp); if(!it->hdr) { return QIDX_NO_BAM_HDR; } it->file_offset = bgzf_tell(fp->fp.bgzf); if((it->ret = sam_read1(it->fp, it->hdr, it->b)) < 0) { return QIDX_BAM_READ_FAILURE; } it->queryname = sdsnew(bam_get_qname(it->b)); it->aln_specs = NULL; it->n_aln_sz = it->n_aln_cap = 0; if(bgzf_seek(fp->fp.bgzf, it->file_offset, SEEK_SET) < 0) { return QIDX_IO_FAIL; } it->pending_release = NULL; return QIDX_OK; } static void _qidx_record_iter_free_record(qidx_record_iter_t * it) { if(it->pending_release) { free(it->pending_release->alns); sdsfree(it->pending_release->qname); it->pending_release = NULL; } } qidx_err_t qidx_record_iter_next(qidx_record_iter_t * it, qidx_record_t * rec) { _qidx_record_iter_free_record(it); sds queryname; int cmp; int ret; bool yield = false; while ((ret = sam_read1(it->fp, it->hdr, it->b)) >= 0) { queryname = sdsnew(bam_get_qname(it->b)); if(!queryname) { return QIDX_NO_MEM; } if((cmp = strnum_cmp(it->queryname, queryname)) != 0) { if(cmp > 0) { printf("Err: the file is not sorted\n"); return QIDX_NOT_SORTED; } rec->n_alns = it->n_aln_sz; rec->alns = it->aln_specs; rec->qname = it->queryname; it->pending_release = rec; yield = true; it->queryname = sdsdup(queryname); it->aln_specs = NULL; it->n_aln_cap = it->n_aln_sz = 0; } AM_RESIZE(it->aln_specs, it->n_aln_sz + 1, it->n_aln_cap); it->aln_specs[it->n_aln_sz].pos = it->b->core.pos; it->aln_specs[it->n_aln_sz].tid = it->b->core.tid; it->aln_specs[it->n_aln_sz].vptr = it->file_offset; it->n_aln_sz++; // update offset for next record it->file_offset = bgzf_tell(it->fp->fp.bgzf); sdsfree(queryname); if(yield) { break; } } if(ret < -1) { it->pending_release = NULL; return QIDX_BAM_READ_FAILURE; } else if (ret == -1 && it->queryname != NULL) { rec->n_alns = it->n_aln_sz; rec->alns = it->aln_specs; rec->qname = it->queryname; it->pending_release = rec; yield = true; it->queryname = NULL; it->aln_specs = NULL; it->n_aln_cap = it->n_aln_sz = 0; } if(!yield) { return QIDX_ITER_DONE; } return QIDX_OK; } qidx_err_t qidx_record_iter_destroy(qidx_record_iter_t * it) { _qidx_record_iter_free_record(it); sdsfree(it->queryname); bam_hdr_destroy(it->hdr); bam_destroy1(it->b); return QIDX_OK; } #include // @abstract Structure containing summary statistics estimated using // a one-pass algorithm // @field mu mean // @field s sample variance // @field n number of entries typedef struct { double mu; double s; size_t n; } roll_est_params_t; // @abstract observe a data point void roll_est_add_evidence(roll_est_params_t * p, double x) { double delta = x - p->mu; p->mu += delta / (p->n + 1); p->s += (x - p->mu) * delta; p->n++; } // @abstract finalize the estimate void roll_est_finalize(roll_est_params_t * p) { p->s /= p->n; } // Calculates the maximum number of items placed into a single bucket // to not exceed the bucket_size with a probability specified by the // area to the right of the positive z-score z. E.g. at z=2 there is // 2.4% chance of the soft limit being exceeded given objects are // assigned uniformly to each bucket. (The cardinality of each bucket // is the same.) // // In order for the maximum size to (really) not be exceeded, the // binomial distribution of items in each bucket would need to be // included. It is recommended to set this soft limit much lower // than the hard limit to not exceed it. // // This is derived from the standard error as measured by the sampling // distribution. // @param soft_limit a soft limit on the number of variable-length items // stuffed into a single bucket // @param z the z-score with which this will happen // @param p the parameters of the normal distribution size_t estimate_n_buckets(double soft_limit, double z, roll_est_params_t const * p) { double n = sqrt(z * z * p->s + 4 * p->mu * soft_limit) - z * sqrt(p->s); n = n / (2 * p->mu); int max_bucket_size = floor(n * n); return (size_t) ceil((double) p->n / (double) max_bucket_size); } typedef struct { uint64_t offset; uint32_t n_bytes; } qidx_bucket_t; typedef struct { uint32_t n_buckets; uint32_t max_bucket_size; qidx_bucket_t * buckets; char * rb_start; } qidx_htable_t; #define QIDX_MAGIC 0x5d1de6b4 #define QIDX_VERSION 1 #pragma pack(push, 1) typedef struct { uint32_t magic; uint16_t version; uint32_t n_buckets; uint32_t max_bucket_size; } qidx_header_t; #pragma pack(pop) typedef struct { int fd; void * map; size_t len; bool read_only; qidx_header_t * hdr; qidx_bucket_t * buckets; qidx_htable_t * htab; } qidx_fp_t; size_t qidx_file_size(uint32_t n_buckets, uint32_t max_bucket_size) { size_t sz = 0; sz += sizeof(qidx_header_t); sz += sizeof(qidx_htable_t) * n_buckets; sz += (uint64_t) n_buckets * (uint64_t) max_bucket_size; return sz; } #include #include // @abstract initialize htab, must first populate fp->buckets qidx_err_t _qidx_init_htab(qidx_fp_t * fp, uint32_t n_buckets, uint32_t max_bucket_size) { qidx_htable_t * htab = malloc(sizeof(qidx_htable_t)); if(!htab) { return QIDX_NO_MEM; } htab->buckets = calloc(sizeof(qidx_bucket_t), n_buckets); if(!htab->buckets) { free(htab); return QIDX_NO_MEM; } htab->n_buckets = n_buckets; htab->max_bucket_size = max_bucket_size; htab->rb_start = (void *) (fp->buckets + n_buckets); fp->htab = htab; return QIDX_OK; } qidx_err_t qidx_open(qidx_fp_t * fp, int fd) { fp->fd = fd; qidx_header_t hdr; if(read(fd, &hdr, sizeof(qidx_header_t)) == -1) { return QIDX_FAILED_TO_OPEN_INDEX_FILE; } if(le32toh(hdr.magic) != QIDX_MAGIC) { return QIDX_INVALID_MAGIC; } if(le16toh(hdr.version) != QIDX_VERSION) { return QIDX_INVALID_VERSION; } uint32_t max_bucket_size = le32toh(hdr.max_bucket_size); uint32_t n_buckets = le32toh(hdr.n_buckets); size_t sz = qidx_file_size(n_buckets, max_bucket_size); if(lseek(fd, 0, SEEK_SET) == -1) { return QIDX_IO_FAIL; } fp->read_only = true; fp->map = mmap(NULL, sz, PROT_READ, MAP_SHARED, fd, 0); fp->len = sz; if(fp->map == MAP_FAILED) { return QIDX_MAP_FAIL; } fp->hdr = fp->map; fp->buckets = (qidx_bucket_t *)(fp->hdr + 1); qidx_err_t err = _qidx_init_htab(fp, n_buckets, max_bucket_size); if(err) { return err; } for(size_t i = 0; i < n_buckets; i++) { fp->htab->buckets[i].n_bytes = le32toh(fp->buckets[i].n_bytes); fp->htab->buckets[i].offset = le64toh(fp->buckets[i].offset); } return QIDX_OK; } qidx_err_t qidx_create(qidx_fp_t * fp, int fd, uint32_t _n_buckets, uint32_t max_bucket_size) { uint32_t n_buckets = next_prime_above_lowerbound(_n_buckets); size_t sz = qidx_file_size(n_buckets, max_bucket_size); fp->fd = fd; if(ftruncate(fd, sz) == -1) { return QIDX_IO_FAIL; } fp->read_only = false; fp->map = mmap(NULL, sz, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); fp->len = sz; if(fp->map == MAP_FAILED) { return QIDX_MAP_FAIL; } fp->hdr = fp->map; fp->hdr->magic = htole32(QIDX_MAGIC); fp->hdr->version = htole16(QIDX_VERSION); fp->hdr->n_buckets = htole32(n_buckets); fp->hdr->max_bucket_size = htole32(max_bucket_size); fp->buckets = (qidx_bucket_t *) (fp->hdr + 1); _qidx_init_htab(fp, n_buckets, max_bucket_size); uint64_t off = (char *) fp->htab->rb_start - (char *) fp->map; for(size_t i = 0; i < n_buckets; i++) { fp->htab->buckets[i].offset = off; fp->htab->buckets[i].n_bytes = 0; off += max_bucket_size; } return QIDX_OK; } qidx_err_t qidx_close(qidx_fp_t * fp) { qidx_htable_t * htab = fp->htab; if(!fp->read_only) { for(size_t i = 0; i < htab->n_buckets; i++) { fp->buckets[i].n_bytes = htole32(htab->buckets[i].n_bytes); fp->buckets[i].offset = htole64(htab->buckets[i].offset); } } free(htab->buckets); free(htab); if(munmap(fp->map, fp->len) != -1) { return QIDX_MAP_FAIL; } return QIDX_OK; } qidx_htable_t * qidx_get_htable(qidx_fp_t * fp) { return fp->htab; } static qidx_bucket_t * _qidx_bucket(char const * qname, qidx_htable_t const * tab) { size_t hash = murmur_hash_str_64s(qname, 0x12345); qidx_bucket_t * bucket = &tab->buckets[hash % tab->n_buckets]; return bucket; } qidx_err_t qidx_table_add(qidx_htable_t * tab, qidx_record_t const * rec) { qidx_bucket_t * bucket = _qidx_bucket(rec->qname, tab); void * ptr = (char *) tab->rb_start + bucket->offset + bucket->n_bytes; size_t rem = tab->max_bucket_size - bucket->n_bytes; ssize_t wrlen = qidx_record_marital(rec, ptr, rem); if(wrlen < 0) { return QIDX_BUCKET_MAX_SIZE_EXCEEDED; } bucket->n_bytes += wrlen; return QIDX_OK; } qidx_err_t qidx_table_get(qidx_htable_t const * tab, char const * qname, qidx_record_t ** rec) { qidx_bucket_t * bucket = _qidx_bucket(qname, tab); void const * bucket_start = tab->rb_start + bucket->offset; void const * record_start = qidx_record_martial_find(bucket_start, bucket->n_bytes, qname); if(!record_start) { *rec = NULL; } else { size_t rem = bucket->n_bytes - ((char const *) record_start - (char const *) bucket_start); *rec = malloc(sizeof(qidx_record_t)); if(!*rec) { return QIDX_NO_MEM; } void const * next = qidx_record_unmartial(*rec, record_start, rem); if(!next) { return QIDX_IO_FAIL; } } return QIDX_OK; } void qidx_die_err(qidx_err_t err) { char const * errstr = qidx_strerr(err); if(qidx_errno(err)) { die_errno("err: %s", errstr); } else { die("err: %s", errstr); } } void qidx_record_print(FILE * file, qidx_record_t * rec) { fprintf(file, "qname: %s\n", rec->qname); for(size_t i = 0; i < rec->n_alns; i++) { fprintf(file, "\tTID: %u\n", rec->alns[i].tid); fprintf(file, "\tPOS: %u\n", rec->alns[i].pos); fprintf(file, "\tVPTR: %lu\n", rec->alns[i].vptr); } } qidx_err_t qidx_create_index(char const * bamfile, char const * indexfile) { qidx_err_t err; htsFile *b_fp = hts_open(bamfile, "r"); if(b_fp == NULL) { return QIDX_FAILED_TO_OPEN_BAMFILE; } // Pass 1: // - estimate mu, s, n of the distribution of record sizes // - estimate the number of buckets needed in the hash table qidx_record_iter_t it; if((err = qidx_record_iter_init(&it, b_fp))) { return err; } roll_est_params_t params; memset(¶ms, 0, sizeof(roll_est_params_t)); qidx_record_t rec; while(!(err = qidx_record_iter_next(&it, &rec))) { double sz = (double) qidx_record_msize(&rec); roll_est_add_evidence(¶ms, sz); } if(err != QIDX_ITER_DONE) return err; if ((err = qidx_record_iter_destroy(&it))) { return err; } roll_est_finalize(¶ms); double soft_limit = 8 * 1024; double z = 5; uint32_t n_buckets = estimate_n_buckets(soft_limit, z, ¶ms); uint32_t max_bucket_size = 1 << 24; printf("record size (mean): %f\n", params.mu); printf("record size (variance): %f\n", params.s); printf("number of records: %zu\n", params.n); printf("number of buckets: %u\n", n_buckets); FILE * qidx = fopen(indexfile, "wb+"); if(!qidx) { return QIDX_FAILED_TO_OPEN_INDEX_FILE; } int fd = fileno(qidx); // PASS 2: // - add records to index // Rewind to start of BAM file if (-1 == bgzf_seek(b_fp->fp.bgzf, 0, SEEK_SET)) { return QIDX_IO_FAIL; } qidx_fp_t q_fp; if((err = qidx_create(&q_fp, fd, n_buckets, max_bucket_size))) { qidx_die_err(err); } qidx_htable_t * htab = qidx_get_htable(&q_fp); if ((err = qidx_record_iter_init(&it, b_fp))) { return err; } while((err = qidx_record_iter_next(&it, &rec)) == QIDX_OK) { qidx_table_add(htab, &rec); } if(err != QIDX_ITER_DONE) return err; if ((err = qidx_record_iter_destroy(&it))) { return err; } qidx_close(&q_fp); fclose(qidx); hts_close(b_fp); return QIDX_OK; } qidx_err_t qidx_search_index(char const * indexfile, char const * bamfile, char const * qname) { FILE * index = fopen(indexfile, "rb"); if(!index) { return QIDX_FAILED_TO_OPEN_INDEX_FILE; } int fd = fileno(index); qidx_fp_t q_fp; qidx_err_t err = qidx_open(&q_fp, fd); if(err) return err; qidx_htable_t * htab = qidx_get_htable(&q_fp); qidx_record_t * rec; if((err = qidx_table_get(htab, qname, &rec))) { return err; } if(rec) { htsFile * fp = hts_open(bamfile, "r"); htsFile* out_fp = hts_open("-", "w"); if(!fp) { return QIDX_FAILED_TO_OPEN_BAMFILE; } bam1_t * b = bam_init1(); bam_hdr_t * hdr = sam_hdr_read(fp); for(size_t i = 0; i < rec->n_alns; i++) { aln_spec_t * aln = &rec->alns[i]; if(bgzf_seek(fp->fp.bgzf, aln->vptr, SEEK_SET) < 0) { return QIDX_IO_FAIL; } int ret; if((ret = sam_read1(fp, hdr, b)) >= 0) { int ret = sam_write1(out_fp, hdr, b); if(ret < 0) { return QIDX_IO_FAIL; } } } sdsfree(rec->qname); free(rec->alns); free(rec); bam_hdr_destroy(hdr); bam_destroy1(b); hts_close(out_fp); hts_close(fp); } else { fprintf(stderr, "Could not find read: %s\n", qname); } return QIDX_OK; } void die_usage(char const * prog) { err("%s {create,search}", prog); err(" create [bamfile] [indexfile]"); die(" search [index] [read]"); } int main(int argc, char ** argv) { if(!argc) return 1; if(argc < 3) die_usage(argv[0]); char const * type = argv[1]; qidx_err_t err; if(strcmp(type, "create") == 0) { if(argc != 4) die_usage(argv[0]); char const * bamfile = argv[2]; char const * indexfile = argv[3]; err = qidx_create_index(bamfile, indexfile); if(err) { qidx_die_err(err); } } else if (strcmp(type, "search") == 0) { if(argc != 5) die_usage(argv[0]); char const * indexfile = argv[2]; char const * bamfile = argv[3]; char const * qname = argv[4]; if((err = qidx_search_index(indexfile, bamfile, qname))) { qidx_die_err(err); } } else { die_usage(argv[0]); } }