aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--main.c925
-rw-r--r--makefile43
-rw-r--r--qidx.c899
-rw-r--r--qidx.h70
4 files changed, 1049 insertions, 888 deletions
diff --git a/main.c b/main.c
index aa2f3d6..c56a951 100644
--- a/main.c
+++ b/main.c
@@ -1,735 +1,9 @@
#include <stdio.h>
#include <stdlib.h>
-#include <string.h>
-#include <assert.h>
-#include <htslib/sam.h>
-#include <htslib/hts.h>
-#include <htslib/bgzf.h>
-#include <stdbool.h>
-
-#include "util/sds/sds.h"
-#include "util/prime_search.h"
-#include "util/hash.h"
-#include "util/util.h"
-
-#include <endian.h>
-
-// @abstract Structure representing an unique alignment and virtual file pointer
-// @field tid chromosome ID
-// @field pos 0-based leftmost coordinate
-// @field vptr virtual pointer
-typedef struct {
- uint32_t tid;
- uint32_t pos;
- uint64_t vptr;
-} aln_spec_t;
-
-
-#pragma pack(push, 1)
-
-typedef struct {
- uint32_t tid;
- uint32_t pos;
- uint64_t vptr;
-} aln_spec_packed_t;
-
-#pragma pack(pop)
-
-// constrains (qname < 2 ** 32)
-
-// @abstract Structure representing all alignments by queryname
-// @field qname queryname (name for the given read)
-// @field n_alns number of alignments
-// @field alns array with alignments structures
-typedef struct {
- char * qname;
- uint16_t n_alns;
- aln_spec_t * alns;
-} qidx_record_t;
-
-// @abstract Get the marshaled size of a record
-static size_t qidx_record_msize(qidx_record_t const * q) {
- return (
- strlen(q->qname) + 1 +
- sizeof(uint16_t) +
- sizeof(aln_spec_packed_t) * q->n_alns
- );
-}
-
-// @abstract Martial a record to a fixed buffer
-//
-// @param q query index record
-// @param dst destination buffer
-// @param dstlen length of the destination buffer
-//
-// @return returns the number of bytes written to the buffer
-// or < 0 if the destination buffer is exceeded
-ssize_t qidx_record_marital(qidx_record_t const * q, void * dst, size_t dstlen) {
- size_t rem = dstlen;
-
- size_t n = strlen(q->qname) + 1;
- if(rem < n)
- return -1;
- rem -= n;
-
- {
- char * _dst = dst;
- strcpy(_dst, q->qname);
- dst = (_dst + n);
- }
-
- size_t sz = 0;
- sz += sizeof(uint16_t);
- sz += sizeof(aln_spec_packed_t) * q->n_alns;
-
- if(rem < sz)
- return -1;
- rem -= sz;
-
- {
- uint16_t * _dst = dst;
- *_dst = htole16(q->n_alns);
- dst = _dst + 1;
- }
-
- aln_spec_packed_t * buf_aln = dst;
- for(size_t i = 0; i < q->n_alns; i++) {
- aln_spec_t * aln = &q->alns[i];
-
- buf_aln->tid = htole32(aln->tid);
- buf_aln->pos = htole32(aln->pos);
- buf_aln->vptr = htole64(aln->vptr);
-
- buf_aln++;
- }
-
- return dstlen - rem;
-}
-
-// @abstract Helper to parse the query name and the number of alignments
-// @returns NULL if a parsing error occured else the location after the buffer
-static void const * _parse_record_header(void const * buf, size_t buflen,
- char const ** qname, uint16_t * n_alns) {
-
- size_t n = strnlen(buf, buflen) + 1;
- *qname = buf;
-
- if(n + sizeof(uint16_t) > buflen)
- return NULL;
-
- uint16_t const * _n_alns = (uint16_t const *) (*qname + n);
- *n_alns = le64toh(*_n_alns);
- buf = _n_alns + 1;
-
- if(sizeof(aln_spec_packed_t) * *n_alns > buflen)
- return NULL;
-
- return buf;
-}
-
-// @abstract Unmartial a query record. The resulting memory needs to be freed.
-// All strings are sds strings.
-//
-// @param q query index record
-// @param buf source buffer
-// @param buflen source buffer length
-//
-// @return a pointer within buf after the record
-// or NULL if a parsing error occurred
-void const * qidx_record_unmartial(qidx_record_t * q, void const * buf, size_t buflen) {
- char const * qname;
-
- if(!(buf = _parse_record_header(buf, buflen, &qname, &q->n_alns))) {
- return NULL;
- }
-
- q->qname = sdsnew(qname);
- q->alns = calloc(q->n_alns, sizeof(aln_spec_t));
-
- aln_spec_packed_t const * buf_aln = buf;
- for(size_t i = 0; i < q->n_alns; i++) {
- aln_spec_t * aln = &q->alns[i];
-
- aln->pos = le32toh(buf_aln->pos);
- aln->tid = le32toh(buf_aln->tid);
- aln->vptr = le64toh(buf_aln->vptr);
-
- buf_aln++;
- }
-
- return buf_aln;
-}
-
-// @abstract Find a record efficiently in a marshaled buffer of query index records
-//
-// @param buf source buffer
-// @param buflen length of the source buffer
-// @param qname target queryname
-//
-// @returns a pointer to a target query or NULL if it a record with the target query name
-// does not exist in the buffer
-void const * qidx_record_martial_find(void const * buf, size_t buflen, char const * qname) {
- size_t n = strnlen(buf, buflen);
-
- char const * _qname;
- uint16_t n_alns;
-
- void const * cur = buf,
- * next = NULL;
-
- while((next = _parse_record_header(cur, buflen, &_qname, &n_alns))) {
-
- if(strcmp(_qname, qname) == 0) {
- return cur;
- }
-
- next = (aln_spec_packed_t *) next + n_alns;
- size_t l = (char const *) next - (char const *) cur;
- buflen -= l;
- cur = next;
-
- if(!buflen) {
- return NULL;
- }
- }
-
- // Invalid
- return NULL;
-}
-
-typedef enum {
- QIDX_OK = 0,
- QIDX_ITER_DONE,
- QIDX_NO_MEM,
- QIDX_NO_BAM_HDR,
- QIDX_BAM_READ_FAILURE,
- QIDX_REC_ITER_FAILURE,
- QIDX_NOT_SORTED,
- QIDX_BUCKET_MAX_SIZE_EXCEEDED,
- QIDX_MAP_FAIL,
- QIDX_IO_FAIL,
- QIDX_FAILED_TO_OPEN_BAMFILE,
- QIDX_FAILED_TO_OPEN_INDEX_FILE,
- QIDX_INVALID_VERSION,
- QIDX_INVALID_MAGIC,
-} qidx_err_t;
-
-bool qidx_errno(qidx_err_t err) {
- switch(err) {
- case QIDX_MAP_FAIL:
- case QIDX_IO_FAIL:
- return true;
- default:
- return false;
- }
-
-}
-
-char const * qidx_strerr(qidx_err_t err) {
- switch(err) {
- case QIDX_NO_MEM: return "memory was exausted";
- case QIDX_NO_BAM_HDR: return "parsing BAM header failed";
- case QIDX_BAM_READ_FAILURE: return "failed to read bamfile";
- case QIDX_REC_ITER_FAILURE: return "bam query index record iterator failed";
- case QIDX_NOT_SORTED: return "input bam file not sorted by query name";
- case QIDX_BUCKET_MAX_SIZE_EXCEEDED: return "maximum query size exceeded";
- case QIDX_MAP_FAIL: return "mmap failed";
- case QIDX_IO_FAIL: return "qidx io failure";
- case QIDX_FAILED_TO_OPEN_BAMFILE: return "failed to open bamfile";
- case QIDX_FAILED_TO_OPEN_INDEX_FILE: return "failed to open indexfile";
- case QIDX_INVALID_MAGIC: return "invalid magic number (not a qidx or corrupted)";
- case QIDX_INVALID_VERSION: return "qidx version outdated, please regenerate";
- case QIDX_ITER_DONE:
- case QIDX_OK:
- return "OK";
- }
-}
-
-qidx_err_t init_qidx(qidx_record_t * r) {
- return QIDX_OK;
-}
-
-void qidx_free(qidx_record_t * r) {
- free(r);
-}
-
-/*
- * SAMTOOLS SORTING ALGORITHM
- */
-
-// strnum_cmp is the string comparator used by samtools sort
-// to sort bam files by read names (e.g. QNAME)
-//
-// The input file is expected to be ordered accordingly. This
-// is verified at runtime.
-#define is_digit(c) ((c)<='9' && (c)>='0')
-static int strnum_cmp(const char *_a, const char *_b)
-{
- const unsigned char *a = (const unsigned char*)_a, *b = (const unsigned char*)_b;
- const unsigned char *pa = a, *pb = b;
- while (*pa && *pb) {
- if (!is_digit(*pa) || !is_digit(*pb)) {
- if (*pa != *pb)
- return (int)*pa - (int)*pb;
- ++pa; ++pb;
- } else {
- // skip leading zeros
- while (*pa == '0') ++pa;
- while (*pb == '0') ++pb;
-
- // skip matching digits
- while (is_digit(*pa) && *pa == *pb)
- pa++, pb++;
-
- // Now mismatching, so see which ends the number sooner
- int diff = (int)*pa - (int)*pb;
- while (is_digit(*pa) && is_digit(*pb))
- pa++, pb++;
-
- if (is_digit(*pa))
- return 1; // pa still going, so larger
- else if (is_digit(*pb))
- return -1; // pb still going, so larger
- else if (diff)
- return diff; // same length, so earlier diff
- }
- }
- return *pa? 1 : *pb? -1 : 0;
-}
-
-
-typedef struct {
- htsFile *fp;
- bam1_t * b;
- bam_hdr_t * hdr;
-
- uint64_t file_offset;
- sds queryname;
- int ret;
-
- aln_spec_t * aln_specs;
- uint16_t n_aln_cap;
- uint16_t n_aln_sz;
-
- qidx_record_t * pending_release;
-} qidx_record_iter_t;
-
-qidx_err_t qidx_record_iter_init(qidx_record_iter_t *it, htsFile *fp) {
- it->b = bam_init1();
- it->fp = fp;
-
- // advance past header
- it->hdr = sam_hdr_read(fp);
-
- if(!it->hdr) {
- return QIDX_NO_BAM_HDR;
- }
-
- it->file_offset = bgzf_tell(fp->fp.bgzf);
-
- if((it->ret = sam_read1(it->fp, it->hdr, it->b)) < 0) {
- return QIDX_BAM_READ_FAILURE;
- }
-
- it->queryname = sdsnew(bam_get_qname(it->b));
- it->aln_specs = NULL;
- it->n_aln_sz = it->n_aln_cap = 0;
-
- if(bgzf_seek(fp->fp.bgzf, it->file_offset, SEEK_SET) < 0) {
- return QIDX_IO_FAIL;
- }
-
- it->pending_release = NULL;
-
- return QIDX_OK;
-}
-
-static void _qidx_record_iter_free_record(qidx_record_iter_t * it) {
- if(it->pending_release) {
- free(it->pending_release->alns);
- sdsfree(it->pending_release->qname);
- it->pending_release = NULL;
- }
-}
-
-qidx_err_t qidx_record_iter_next(qidx_record_iter_t * it, qidx_record_t * rec) {
-
- _qidx_record_iter_free_record(it);
-
- sds queryname;
- int cmp;
- int ret;
- bool yield = false;
- while ((ret = sam_read1(it->fp, it->hdr, it->b)) >= 0) {
- queryname = sdsnew(bam_get_qname(it->b));
-
- if(!queryname) {
- return QIDX_NO_MEM;
- }
-
- if((cmp = strnum_cmp(it->queryname, queryname)) != 0) {
- if(cmp > 0) {
- printf("Err: the file is not sorted\n");
- return QIDX_NOT_SORTED;
- }
-
- rec->n_alns = it->n_aln_sz;
- rec->alns = it->aln_specs;
- rec->qname = it->queryname;
-
- it->pending_release = rec;
-
- yield = true;
-
- it->queryname = sdsdup(queryname);
- it->aln_specs = NULL;
- it->n_aln_cap = it->n_aln_sz = 0;
- }
-
- AM_RESIZE(it->aln_specs, it->n_aln_sz + 1, it->n_aln_cap);
-
- it->aln_specs[it->n_aln_sz].pos = it->b->core.pos;
- it->aln_specs[it->n_aln_sz].tid = it->b->core.tid;
- it->aln_specs[it->n_aln_sz].vptr = it->file_offset;
-
- it->n_aln_sz++;
-
- // update offset for next record
- it->file_offset = bgzf_tell(it->fp->fp.bgzf);
-
- sdsfree(queryname);
-
- if(yield) {
- break;
- }
- }
-
- if(ret < -1) {
- it->pending_release = NULL;
- return QIDX_BAM_READ_FAILURE;
- } else if (ret == -1 && it->queryname != NULL) {
- rec->n_alns = it->n_aln_sz;
- rec->alns = it->aln_specs;
- rec->qname = it->queryname;
-
- it->pending_release = rec;
-
- yield = true;
-
- it->queryname = NULL;
- it->aln_specs = NULL;
- it->n_aln_cap = it->n_aln_sz = 0;
- }
-
- if(!yield) {
- return QIDX_ITER_DONE;
- }
-
- return QIDX_OK;
-}
-
-qidx_err_t qidx_record_iter_destroy(qidx_record_iter_t * it) {
- _qidx_record_iter_free_record(it);
-
- sdsfree(it->queryname);
-
- bam_hdr_destroy(it->hdr);
- bam_destroy1(it->b);
-
- return QIDX_OK;
-}
-
-#include <math.h>
-
-// @abstract Structure containing summary statistics estimated using
-// a one-pass algorithm
-// @field mu mean
-// @field s sample variance
-// @field n number of entries
-typedef struct {
- double mu;
- double s;
- size_t n;
-} roll_est_params_t;
-
-// @abstract observe a data point
-void roll_est_add_evidence(roll_est_params_t * p, double x) {
- double delta = x - p->mu;
- p->mu += delta / (p->n + 1);
- p->s += (x - p->mu) * delta;
- p->n++;
-}
-
-// @abstract finalize the estimate
-void roll_est_finalize(roll_est_params_t * p) {
- p->s /= p->n;
-}
-
-// Calculates the maximum number of items placed into a single bucket
-// to not exceed the bucket_size with a probability specified by the
-// area to the right of the positive z-score z. E.g. at z=2 there is
-// 2.4% chance of the soft limit being exceeded given objects are
-// assigned uniformly to each bucket. (The cardinality of each bucket
-// is the same.)
-//
-// In order for the maximum size to (really) not be exceeded, the
-// binomial distribution of items in each bucket would need to be
-// included. It is recommended to set this soft limit much lower
-// than the hard limit to not exceed it.
-//
-// This is derived from the standard error as measured by the sampling
-// distribution.
-
-// @param soft_limit a soft limit on the number of variable-length items
-// stuffed into a single bucket
-// @param z the z-score with which this will happen
-// @param p the parameters of the normal distribution
-size_t estimate_n_buckets(double soft_limit, double z, roll_est_params_t const * p) {
- double n = sqrt(z * z * p->s + 4 * p->mu * soft_limit) - z * sqrt(p->s);
- n = n / (2 * p->mu);
-
- int max_bucket_size = floor(n * n);
-
- return (size_t) ceil((double) p->n / (double) max_bucket_size);
-}
-
-typedef struct {
- uint64_t offset;
- uint32_t n_bytes;
-} qidx_bucket_t;
-
-typedef struct {
- uint32_t n_buckets;
- uint32_t max_bucket_size;
- qidx_bucket_t * buckets;
- char * rb_start;
-} qidx_htable_t;
-
-#define QIDX_MAGIC 0x5d1de6b4
-#define QIDX_VERSION 1
-
-#pragma pack(push, 1)
-
-typedef struct {
- uint32_t magic;
- uint16_t version;
- uint32_t n_buckets;
- uint32_t max_bucket_size;
-} qidx_header_t;
-
-#pragma pack(pop)
-
-typedef struct {
- int fd;
- void * map;
- size_t len;
- bool read_only;
- qidx_header_t * hdr;
- qidx_bucket_t * buckets;
- qidx_htable_t * htab;
-} qidx_fp_t;
-
-size_t qidx_file_size(uint32_t n_buckets, uint32_t max_bucket_size) {
- size_t sz = 0;
-
- sz += sizeof(qidx_header_t);
- sz += sizeof(qidx_htable_t) * n_buckets;
- sz += (uint64_t) n_buckets * (uint64_t) max_bucket_size;
-
- return sz;
-}
-
-#include <sys/mman.h>
#include <unistd.h>
-// @abstract initialize htab, must first populate fp->buckets
-qidx_err_t _qidx_init_htab(qidx_fp_t * fp,
- uint32_t n_buckets, uint32_t max_bucket_size) {
- qidx_htable_t * htab = malloc(sizeof(qidx_htable_t));
- if(!htab) {
- return QIDX_NO_MEM;
- }
-
- htab->buckets = calloc(sizeof(qidx_bucket_t), n_buckets);
- if(!htab->buckets) {
- free(htab);
- return QIDX_NO_MEM;
- }
-
- htab->n_buckets = n_buckets;
- htab->max_bucket_size = max_bucket_size;
-
- htab->rb_start = (void *) (fp->buckets + n_buckets);
-
- fp->htab = htab;
-
- return QIDX_OK;
-}
-
-
-qidx_err_t qidx_open(qidx_fp_t * fp, int fd) {
-
- fp->fd = fd;
-
- qidx_header_t hdr;
- if(read(fd, &hdr, sizeof(qidx_header_t)) == -1) {
- return QIDX_FAILED_TO_OPEN_INDEX_FILE;
- }
-
- if(le32toh(hdr.magic) != QIDX_MAGIC) {
- return QIDX_INVALID_MAGIC;
- }
-
- if(le16toh(hdr.version) != QIDX_VERSION) {
- return QIDX_INVALID_VERSION;
- }
-
- uint32_t max_bucket_size = le32toh(hdr.max_bucket_size);
- uint32_t n_buckets = le32toh(hdr.n_buckets);
-
- size_t sz = qidx_file_size(n_buckets, max_bucket_size);
-
- if(lseek(fd, 0, SEEK_SET) == -1) {
- return QIDX_IO_FAIL;
- }
-
- fp->read_only = true;
- fp->map = mmap(NULL, sz, PROT_READ, MAP_SHARED, fd, 0);
- fp->len = sz;
-
- if(fp->map == MAP_FAILED) {
- return QIDX_MAP_FAIL;
- }
-
- fp->hdr = fp->map;
- fp->buckets = (qidx_bucket_t *)(fp->hdr + 1);
-
- qidx_err_t err = _qidx_init_htab(fp, n_buckets, max_bucket_size);
- if(err) { return err; }
-
- for(size_t i = 0; i < n_buckets; i++) {
- fp->htab->buckets[i].n_bytes = le32toh(fp->buckets[i].n_bytes);
- fp->htab->buckets[i].offset = le64toh(fp->buckets[i].offset);
- }
-
- return QIDX_OK;
-}
-
-qidx_err_t qidx_create(qidx_fp_t * fp, int fd, uint32_t _n_buckets, uint32_t max_bucket_size) {
-
- uint32_t n_buckets = next_prime_above_lowerbound(_n_buckets);
-
- size_t sz = qidx_file_size(n_buckets, max_bucket_size);
-
- fp->fd = fd;
-
- if(ftruncate(fd, sz) == -1) {
- return QIDX_IO_FAIL;
- }
-
- fp->read_only = false;
- fp->map = mmap(NULL, sz, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
- fp->len = sz;
-
- if(fp->map == MAP_FAILED) {
- return QIDX_MAP_FAIL;
- }
-
- fp->hdr = fp->map;
-
- fp->hdr->magic = htole32(QIDX_MAGIC);
- fp->hdr->version = htole16(QIDX_VERSION);
- fp->hdr->n_buckets = htole32(n_buckets);
- fp->hdr->max_bucket_size = htole32(max_bucket_size);
-
- fp->buckets = (qidx_bucket_t *) (fp->hdr + 1);
-
- _qidx_init_htab(fp, n_buckets, max_bucket_size);
-
- uint64_t off = (char *) fp->htab->rb_start - (char *) fp->map;
- for(size_t i = 0; i < n_buckets; i++) {
- fp->htab->buckets[i].offset = off;
- fp->htab->buckets[i].n_bytes = 0;
- off += max_bucket_size;
- }
-
- return QIDX_OK;
-}
-
-qidx_err_t qidx_close(qidx_fp_t * fp) {
-
- qidx_htable_t * htab = fp->htab;
-
- if(!fp->read_only) {
- for(size_t i = 0; i < htab->n_buckets; i++) {
- fp->buckets[i].n_bytes = htole32(htab->buckets[i].n_bytes);
- fp->buckets[i].offset = htole64(htab->buckets[i].offset);
- }
- }
-
- free(htab->buckets);
- free(htab);
-
- if(munmap(fp->map, fp->len) != -1) {
- return QIDX_MAP_FAIL;
- }
-
- return QIDX_OK;
-}
-
-qidx_htable_t * qidx_get_htable(qidx_fp_t * fp) { return fp->htab; }
-
-static qidx_bucket_t * _qidx_bucket(char const * qname, qidx_htable_t const * tab) {
- size_t hash = murmur_hash_str_64s(qname, 0x12345);
-
- qidx_bucket_t * bucket = &tab->buckets[hash % tab->n_buckets];
-
- return bucket;
-}
-
-qidx_err_t qidx_table_add(qidx_htable_t * tab, qidx_record_t const * rec) {
- qidx_bucket_t * bucket = _qidx_bucket(rec->qname, tab);
-
- void * ptr = (char *) tab->rb_start + bucket->offset + bucket->n_bytes;
- size_t rem = tab->max_bucket_size - bucket->n_bytes;
-
- ssize_t wrlen = qidx_record_marital(rec, ptr, rem);
-
- if(wrlen < 0) {
- return QIDX_BUCKET_MAX_SIZE_EXCEEDED;
- }
-
- bucket->n_bytes += wrlen;
-
- return QIDX_OK;
-}
-
-qidx_err_t qidx_table_get(qidx_htable_t const * tab,
- char const * qname, qidx_record_t ** rec) {
- qidx_bucket_t * bucket = _qidx_bucket(qname, tab);
-
- void const * bucket_start = tab->rb_start + bucket->offset;
- void const * record_start =
- qidx_record_martial_find(bucket_start, bucket->n_bytes, qname);
-
- if(!record_start) {
- *rec = NULL;
- } else {
- size_t rem = bucket->n_bytes -
- ((char const *) record_start - (char const *) bucket_start);
- *rec = malloc(sizeof(qidx_record_t));
-
- if(!*rec) { return QIDX_NO_MEM; }
-
- void const * next = qidx_record_unmartial(*rec, record_start, rem);
-
- if(!next) {
- return QIDX_IO_FAIL;
- }
- }
-
- return QIDX_OK;
-}
+#include "util/util.h"
+#include "qidx.h"
void qidx_die_err(qidx_err_t err) {
char const * errstr = qidx_strerr(err);
@@ -740,160 +14,10 @@ void qidx_die_err(qidx_err_t err) {
}
}
-void qidx_record_print(FILE * file, qidx_record_t * rec) {
- fprintf(file, "qname: %s\n", rec->qname);
- for(size_t i = 0; i < rec->n_alns; i++) {
- fprintf(file, "\tTID: %u\n", rec->alns[i].tid);
- fprintf(file, "\tPOS: %u\n", rec->alns[i].pos);
- fprintf(file, "\tVPTR: %lu\n", rec->alns[i].vptr);
- }
-}
-
-qidx_err_t qidx_create_index(char const * bamfile, char const * indexfile) {
- qidx_err_t err;
-
- htsFile *b_fp = hts_open(bamfile, "r");
- if(b_fp == NULL) { return QIDX_FAILED_TO_OPEN_BAMFILE; }
-
- // Pass 1:
- // - estimate mu, s, n of the distribution of record sizes
- // - estimate the number of buckets needed in the hash table
- qidx_record_iter_t it;
- if((err = qidx_record_iter_init(&it, b_fp))) {
- return err;
- }
-
- roll_est_params_t params;
- memset(&params, 0, sizeof(roll_est_params_t));
- qidx_record_t rec;
- while(!(err = qidx_record_iter_next(&it, &rec))) {
- double sz = (double) qidx_record_msize(&rec);
- roll_est_add_evidence(&params, sz);
- }
- if(err != QIDX_ITER_DONE) return err;
-
- if ((err = qidx_record_iter_destroy(&it))) {
- return err;
- }
-
- roll_est_finalize(&params);
-
- double soft_limit = 8 * 1024;
- double z = 5;
- uint32_t n_buckets = estimate_n_buckets(soft_limit, z, &params);
- uint32_t max_bucket_size = 1 << 24;
-
- printf("record size (mean): %f\n", params.mu);
- printf("record size (variance): %f\n", params.s);
- printf("number of records: %zu\n", params.n);
- printf("number of buckets: %u\n", n_buckets);
-
- FILE * qidx = fopen(indexfile, "wb+");
-
- if(!qidx) { return QIDX_FAILED_TO_OPEN_INDEX_FILE; }
-
- int fd = fileno(qidx);
-
- // PASS 2:
- // - add records to index
-
- // Rewind to start of BAM file
- if (-1 == bgzf_seek(b_fp->fp.bgzf, 0, SEEK_SET)) {
- return QIDX_IO_FAIL;
- }
-
- qidx_fp_t q_fp;
- if((err = qidx_create(&q_fp, fd, n_buckets, max_bucket_size))) {
- qidx_die_err(err);
- }
-
- qidx_htable_t * htab = qidx_get_htable(&q_fp);
-
- if ((err = qidx_record_iter_init(&it, b_fp))) {
- return err;
- }
-
- while((err = qidx_record_iter_next(&it, &rec)) == QIDX_OK) {
- qidx_table_add(htab, &rec);
- }
- if(err != QIDX_ITER_DONE) return err;
-
- if ((err = qidx_record_iter_destroy(&it))) {
- return err;
- }
-
- qidx_close(&q_fp);
- fclose(qidx);
- hts_close(b_fp);
-
- return QIDX_OK;
-}
-
-qidx_err_t qidx_search_index(char const * indexfile,
- char const * bamfile, char const * qname) {
- FILE * index = fopen(indexfile, "rb");
- if(!index) { return QIDX_FAILED_TO_OPEN_INDEX_FILE; }
-
- int fd = fileno(index);
-
- qidx_fp_t q_fp;
- qidx_err_t err = qidx_open(&q_fp, fd);
- if(err) return err;
-
- qidx_htable_t * htab = qidx_get_htable(&q_fp);
-
- qidx_record_t * rec;
- if((err = qidx_table_get(htab, qname, &rec))) {
- return err;
- }
-
- if(rec) {
- htsFile * fp = hts_open(bamfile, "r");
- htsFile* out_fp = hts_open("-", "w");
-
-
- if(!fp) {
- return QIDX_FAILED_TO_OPEN_BAMFILE;
- }
-
- bam1_t * b = bam_init1();
- bam_hdr_t * hdr = sam_hdr_read(fp);
- for(size_t i = 0; i < rec->n_alns; i++) {
- aln_spec_t * aln = &rec->alns[i];
-
- if(bgzf_seek(fp->fp.bgzf, aln->vptr, SEEK_SET) < 0) {
- return QIDX_IO_FAIL;
- }
-
- int ret;
- if((ret = sam_read1(fp, hdr, b)) >= 0) {
-
- int ret = sam_write1(out_fp, hdr, b);
- if(ret < 0) {
- return QIDX_IO_FAIL;
- }
- }
- }
-
- sdsfree(rec->qname);
- free(rec->alns);
- free(rec);
-
- bam_hdr_destroy(hdr);
- bam_destroy1(b);
- hts_close(out_fp);
- hts_close(fp);
- } else {
- fprintf(stderr, "Could not find read: %s\n", qname);
- }
-
- return QIDX_OK;
-}
-
void die_usage(char const * prog) {
err("%s {create,search}", prog);
err(" create [bamfile] [indexfile]");
- die(" search [index] [read]");
+ die(" search [index] [bamfile] [read]");
}
int main(int argc, char ** argv) {
@@ -904,23 +28,54 @@ int main(int argc, char ** argv) {
char const * type = argv[1];
qidx_err_t err;
+ char const * prog = argv[0];
if(strcmp(type, "create") == 0) {
- if(argc != 4) die_usage(argv[0]);
- char const * bamfile = argv[2];
- char const * indexfile = argv[3];
- err = qidx_create_index(bamfile, indexfile);
- if(err) {
+ int option;
+ uint32_t n_buckets = 0;
+ uint32_t max_bucket_size = 1 << 18;
+
+ while((option = getopt(argc, argv, ":b:s:")) != -1) {
+ switch(option) {
+ case 'b':
+ if(1 != sscanf(optarg, "%u", &n_buckets)) {
+ die("could not parse -b");
+ }
+ break;
+ case 's':
+ if(1 != sscanf(optarg, "%u", &max_bucket_size)) {
+ die("could not parse -s");
+ }
+ break;
+ case ':':
+ die("option needs a value");
+ case '?':
+ die("Unknown option %c", optopt);
+ break;
+ }
+ }
+
+ if(argc - optind != 3) die_usage(prog);
+
+ char const * bamfile = argv[optind + 1];
+ char const * indexfile = argv[optind + 2];
+
+ if((err = qidx_create_index1(bamfile,
+ indexfile, n_buckets, max_bucket_size))) {
qidx_die_err(err);
}
+
} else if (strcmp(type, "search") == 0) {
- if(argc != 5) die_usage(argv[0]);
+ if(argc != 5) die_usage(prog);
+
char const * indexfile = argv[2];
char const * bamfile = argv[3];
char const * qname = argv[4];
+
if((err = qidx_search_index(indexfile, bamfile, qname))) {
qidx_die_err(err);
}
+
} else {
die_usage(argv[0]);
}
diff --git a/makefile b/makefile
index e412184..8328469 100644
--- a/makefile
+++ b/makefile
@@ -1,3 +1,40 @@
-main:
- gcc main.c ./util/hash.c ./util/prime_search.c -lhts -lm ./util/sds/sds.c ./util/err.c -g -o qidx -fsanitize=address
-.PHONY: main
+ifeq ($(PREFIX),)
+ PREFIX := /usr/local
+endif
+
+ifeq ($(BINDIR),)
+ BINDIR := /bin
+endif
+
+CMD=qidx
+
+CC=gcc
+
+DEBUG := -fsanitize=address -O2
+
+CFLAGS := -Wall -pedantic -g
+LDFLAGS := -lhts -lm
+
+SRC_EXT:=%.c
+
+_STD_BUILD=$(CC) $(CFLAGS) $(filter $(SRC_EXT) %.o, $^) -o $@
+STD_BUILD=$(_STD_BUILD) $(LDFLAGS)
+STD_COMPILE=$(_STD_BUILD) -c
+
+OBJS = ./util/hash.o
+OBJS += ./util/prime_search.o
+OBJS += ./util/prime_search.o
+OBJS += ./util/sds/sds.o
+OBJS += ./util/err.o
+OBJS += ./qidx.o
+
+make: $(CMD)
+
+$(CMD): ./main.c $(OBJS)
+ $(STD_BUILD)
+
+clean:
+ $(RM) $(CMD) $(OBJS)
+
+%.o: %.c
+ $(STD_COMPILE)
diff --git a/qidx.c b/qidx.c
new file mode 100644
index 0000000..bcee165
--- /dev/null
+++ b/qidx.c
@@ -0,0 +1,899 @@
+#include "qidx.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <htslib/sam.h>
+#include <htslib/bgzf.h>
+#include <endian.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <math.h>
+
+#include "util/util.h"
+#include "util/sds/sds.h"
+#include "util/prime_search.h"
+#include "util/hash.h"
+
+#pragma pack(push, 1)
+
+typedef struct {
+ uint32_t tid;
+ uint32_t pos;
+ uint64_t vptr;
+} aln_spec_packed_t;
+
+#pragma pack(pop)
+
+// @abstract Get the marshaled size of a record
+static size_t qidx_record_msize(qidx_record_t const * q) {
+ return (
+ strlen(q->qname) + 1 +
+ sizeof(uint16_t) +
+ sizeof(aln_spec_packed_t) * q->n_alns
+ );
+}
+
+// @abstract Martial a record to a fixed buffer
+//
+// @param q query index record
+// @param dst destination buffer
+// @param dstlen length of the destination buffer
+//
+// @return returns the number of bytes written to the buffer
+// or < 0 if the destination buffer is exceeded
+ssize_t qidx_record_marital(qidx_record_t const * q, void * dst, size_t dstlen) {
+ size_t rem = dstlen;
+
+ size_t n = strlen(q->qname) + 1;
+ if(rem < n)
+ return -1;
+ rem -= n;
+
+ {
+ char * _dst = dst;
+ strcpy(_dst, q->qname);
+ dst = (_dst + n);
+ }
+
+ size_t sz = 0;
+ sz += sizeof(uint16_t);
+ sz += sizeof(aln_spec_packed_t) * q->n_alns;
+
+ if(rem < sz)
+ return -1;
+ rem -= sz;
+
+ {
+ uint16_t * _dst = dst;
+ *_dst = htole16(q->n_alns);
+ dst = _dst + 1;
+ }
+
+ aln_spec_packed_t * buf_aln = dst;
+ for(size_t i = 0; i < q->n_alns; i++) {
+ aln_spec_t * aln = &q->alns[i];
+
+ buf_aln->tid = htole32(aln->tid);
+ buf_aln->pos = htole32(aln->pos);
+ buf_aln->vptr = htole64(aln->vptr);
+
+ buf_aln++;
+ }
+
+ return dstlen - rem;
+}
+
+// @abstract Helper to parse the query name and the number of alignments
+// @returns NULL if a parsing error occured else the location after the buffer
+static void const * _parse_record_header(void const * buf, size_t buflen,
+ char const ** qname, uint16_t * n_alns) {
+
+ size_t n = strnlen(buf, buflen) + 1;
+ *qname = buf;
+
+ if(n + sizeof(uint16_t) > buflen)
+ return NULL;
+
+ uint16_t const * _n_alns = (uint16_t const *) (*qname + n);
+ *n_alns = le64toh(*_n_alns);
+ buf = _n_alns + 1;
+
+ if(sizeof(aln_spec_packed_t) * *n_alns > buflen)
+ return NULL;
+
+ return buf;
+}
+
+// @abstract Unmartial a query record. The resulting memory needs to be freed.
+// All strings are sds strings.
+//
+// @param q query index record
+// @param buf source buffer
+// @param buflen source buffer length
+//
+// @return a pointer within buf after the record
+// or NULL if a parsing error occurred
+void const * qidx_record_unmartial(qidx_record_t * q, void const * buf, size_t buflen) {
+ char const * qname;
+
+ if(!(buf = _parse_record_header(buf, buflen, &qname, &q->n_alns))) {
+ return NULL;
+ }
+
+ q->qname = sdsnew(qname);
+ q->alns = calloc(q->n_alns, sizeof(aln_spec_t));
+
+ aln_spec_packed_t const * buf_aln = buf;
+ for(size_t i = 0; i < q->n_alns; i++) {
+ aln_spec_t * aln = &q->alns[i];
+
+ aln->pos = le32toh(buf_aln->pos);
+ aln->tid = le32toh(buf_aln->tid);
+ aln->vptr = le64toh(buf_aln->vptr);
+
+ buf_aln++;
+ }
+
+ return buf_aln;
+}
+
+// @abstract Find a record efficiently in a marshaled buffer of query index records
+//
+// @param buf source buffer
+// @param buflen length of the source buffer
+// @param qname target queryname
+//
+// @returns a pointer to a target query or NULL if it a record with the target query name
+// does not exist in the buffer
+void const * qidx_record_martial_find(void const * buf, size_t buflen, char const * qname) {
+ char const * _qname;
+ uint16_t n_alns;
+
+ void const * cur = buf,
+ * next = NULL;
+
+ while((next = _parse_record_header(cur, buflen, &_qname, &n_alns))) {
+
+ if(strcmp(_qname, qname) == 0) {
+ return cur;
+ }
+
+ next = (aln_spec_packed_t *) next + n_alns;
+ size_t l = (char const *) next - (char const *) cur;
+ buflen -= l;
+ cur = next;
+
+ if(!buflen) {
+ return NULL;
+ }
+ }
+
+ // Invalid
+ return NULL;
+}
+
+bool qidx_errno(qidx_err_t err) {
+ switch(err) {
+ case QIDX_MAP_FAIL:
+ case QIDX_IO_FAIL:
+ return true;
+ default:
+ return false;
+ }
+}
+
+char const * qidx_strerr(qidx_err_t err) {
+ switch(err) {
+ case QIDX_NO_MEM: return "memory was exausted";
+ case QIDX_NO_BAM_HDR: return "parsing BAM header failed";
+ case QIDX_BAM_READ_FAILURE: return "failed to read bamfile";
+ case QIDX_REC_ITER_FAILURE: return "bam query index record iterator failed";
+ case QIDX_NOT_SORTED: return "input bam file not sorted by query name";
+ case QIDX_BUCKET_MAX_SIZE_EXCEEDED: return "maximum query size exceeded";
+ case QIDX_MAP_FAIL: return "mmap failed";
+ case QIDX_IO_FAIL: return "qidx io failure";
+ case QIDX_FAILED_TO_OPEN_BAMFILE: return "failed to open bamfile";
+ case QIDX_FAILED_TO_OPEN_INDEX_FILE: return "failed to open indexfile";
+ case QIDX_INVALID_MAGIC: return "invalid magic number (not a qidx or corrupted)";
+ case QIDX_INVALID_VERSION: return "qidx version outdated, please regenerate";
+ case QIDX_ITER_DONE:
+ case QIDX_OK:
+ return "OK";
+ default:
+ return "Unknown";
+ }
+
+}
+
+/*
+ * SAMTOOLS SORTING ALGORITHM
+ */
+
+// strnum_cmp is the string comparator used by samtools sort
+// to sort bam files by read names (e.g. QNAME)
+//
+// The input file is expected to be ordered accordingly. This
+// is verified at runtime.
+#define is_digit(c) ((c)<='9' && (c)>='0')
+static int strnum_cmp(const char *_a, const char *_b)
+{
+ const unsigned char *a = (const unsigned char*)_a, *b = (const unsigned char*)_b;
+ const unsigned char *pa = a, *pb = b;
+ while (*pa && *pb) {
+ if (!is_digit(*pa) || !is_digit(*pb)) {
+ if (*pa != *pb)
+ return (int)*pa - (int)*pb;
+ ++pa; ++pb;
+ } else {
+ // skip leading zeros
+ while (*pa == '0') ++pa;
+ while (*pb == '0') ++pb;
+
+ // skip matching digits
+ while (is_digit(*pa) && *pa == *pb)
+ pa++, pb++;
+
+ // Now mismatching, so see which ends the number sooner
+ int diff = (int)*pa - (int)*pb;
+ while (is_digit(*pa) && is_digit(*pb))
+ pa++, pb++;
+
+ if (is_digit(*pa))
+ return 1; // pa still going, so larger
+ else if (is_digit(*pb))
+ return -1; // pb still going, so larger
+ else if (diff)
+ return diff; // same length, so earlier diff
+ }
+ }
+ return *pa? 1 : *pb? -1 : 0;
+}
+
+typedef struct {
+ htsFile *fp;
+ bam1_t * b;
+ bam_hdr_t * hdr;
+
+ uint64_t file_offset;
+ sds queryname;
+ int ret;
+
+ aln_spec_t * aln_specs;
+ uint16_t n_aln_cap;
+ uint16_t n_aln_sz;
+
+ qidx_record_t * pending_release;
+} qidx_record_iter_t;
+
+qidx_err_t qidx_record_iter_init(qidx_record_iter_t *it, htsFile *fp) {
+ it->b = bam_init1();
+ it->fp = fp;
+
+ // advance past header
+ it->hdr = sam_hdr_read(fp);
+
+ if(!it->hdr) {
+ return QIDX_NO_BAM_HDR;
+ }
+
+ it->file_offset = bgzf_tell(fp->fp.bgzf);
+
+ if((it->ret = sam_read1(it->fp, it->hdr, it->b)) < 0) {
+ return QIDX_BAM_READ_FAILURE;
+ }
+
+ it->queryname = sdsnew(bam_get_qname(it->b));
+ it->aln_specs = NULL;
+ it->n_aln_sz = it->n_aln_cap = 0;
+
+ if(bgzf_seek(fp->fp.bgzf, it->file_offset, SEEK_SET) < 0) {
+ return QIDX_IO_FAIL;
+ }
+
+ it->pending_release = NULL;
+
+ return QIDX_OK;
+}
+
+static void _qidx_record_iter_free_record(qidx_record_iter_t * it) {
+ if(it->pending_release) {
+ free(it->pending_release->alns);
+ sdsfree(it->pending_release->qname);
+ it->pending_release = NULL;
+ }
+}
+
+qidx_err_t qidx_record_iter_next(qidx_record_iter_t * it, qidx_record_t * rec) {
+
+ _qidx_record_iter_free_record(it);
+
+ sds queryname;
+ int cmp;
+ int ret;
+ bool yield = false;
+ while ((ret = sam_read1(it->fp, it->hdr, it->b)) >= 0) {
+ queryname = sdsnew(bam_get_qname(it->b));
+
+ if(!queryname) {
+ return QIDX_NO_MEM;
+ }
+
+ if((cmp = strnum_cmp(it->queryname, queryname)) != 0) {
+ if(cmp > 0) {
+ printf("Err: the file is not sorted\n");
+ return QIDX_NOT_SORTED;
+ }
+
+ rec->n_alns = it->n_aln_sz;
+ rec->alns = it->aln_specs;
+ rec->qname = it->queryname;
+
+ it->pending_release = rec;
+
+ yield = true;
+
+ it->queryname = sdsdup(queryname);
+ it->aln_specs = NULL;
+ it->n_aln_cap = it->n_aln_sz = 0;
+ }
+
+ AM_RESIZE(it->aln_specs, it->n_aln_sz + 1, it->n_aln_cap);
+
+ it->aln_specs[it->n_aln_sz].pos = it->b->core.pos;
+ it->aln_specs[it->n_aln_sz].tid = it->b->core.tid;
+ it->aln_specs[it->n_aln_sz].vptr = it->file_offset;
+
+ it->n_aln_sz++;
+
+ // update offset for next record
+ it->file_offset = bgzf_tell(it->fp->fp.bgzf);
+
+ sdsfree(queryname);
+
+ if(yield) {
+ break;
+ }
+ }
+
+ if(ret < -1) {
+ it->pending_release = NULL;
+ return QIDX_BAM_READ_FAILURE;
+ } else if (ret == -1 && it->queryname != NULL) {
+ rec->n_alns = it->n_aln_sz;
+ rec->alns = it->aln_specs;
+ rec->qname = it->queryname;
+
+ it->pending_release = rec;
+
+ yield = true;
+
+ it->queryname = NULL;
+ it->aln_specs = NULL;
+ it->n_aln_cap = it->n_aln_sz = 0;
+ }
+
+ if(!yield) {
+ return QIDX_ITER_DONE;
+ }
+
+ return QIDX_OK;
+}
+
+qidx_err_t qidx_record_iter_destroy(qidx_record_iter_t * it) {
+ _qidx_record_iter_free_record(it);
+
+ sdsfree(it->queryname);
+
+ bam_hdr_destroy(it->hdr);
+ bam_destroy1(it->b);
+
+ return QIDX_OK;
+}
+
+// @abstract Structure containing summary statistics estimated using
+// a one-pass algorithm
+// @field mu mean
+// @field s sample variance
+// @field n number of entries
+typedef struct {
+ double mu;
+ double s;
+ size_t n;
+} roll_est_params_t;
+
+// @abstract observe a data point
+void roll_est_add_evidence(roll_est_params_t * p, double x) {
+ double delta = x - p->mu;
+ p->mu += delta / (p->n + 1);
+ p->s += (x - p->mu) * delta;
+ p->n++;
+}
+
+// @abstract finalize the estimate
+void roll_est_finalize(roll_est_params_t * p) {
+ p->s /= p->n;
+}
+
+// Calculates the maximum number of items placed into a single bucket
+// to not exceed the bucket_size with a probability specified by the
+// area to the right of the positive z-score z. E.g. at z=2 there is
+// 2.4% chance of the soft limit being exceeded given objects are
+// assigned uniformly to each bucket. (The cardinality of each bucket
+// is the same.)
+//
+// In order for the maximum size to (really) not be exceeded, the
+// binomial distribution of items in each bucket would need to be
+// included. It is recommended to set this soft limit much lower
+// than the hard limit to not exceed it.
+//
+// This is derived from the standard error as measured by the sampling
+// distribution.
+
+// @param soft_limit a soft limit on the number of variable-length items
+// stuffed into a single bucket
+// @param z the z-score with which this will happen
+// @param p the parameters of the normal distribution
+size_t estimate_n_buckets(double soft_limit, double z, roll_est_params_t const * p) {
+ double n = sqrt(z * z * p->s + 4 * p->mu * soft_limit) - z * sqrt(p->s);
+ n = n / (2 * p->mu);
+
+ int max_bucket_size = floor(n * n);
+
+ return (size_t) ceil((double) p->n / (double) max_bucket_size);
+}
+
+typedef struct {
+ uint64_t offset;
+ uint32_t n_bytes;
+} qidx_bucket_t;
+
+#pragma pack(push, 1)
+
+typedef struct {
+ uint64_t offset;
+ uint32_t n_bytes;
+} qidx_bucket_packed_t;
+
+#pragma pack(pop)
+
+
+typedef struct {
+ uint32_t n_buckets;
+ uint32_t max_bucket_size;
+ qidx_bucket_t * buckets;
+ char * rb_start;
+} qidx_htable_t;
+
+#pragma pack(push, 1)
+
+typedef struct {
+ uint32_t magic;
+ uint16_t version;
+ uint32_t n_buckets;
+ uint32_t max_bucket_size;
+} qidx_header_t;
+
+#pragma pack(pop)
+
+struct qidx_fp {
+ int fd;
+ void * map;
+ size_t len;
+ bool read_only;
+ qidx_header_t * hdr;
+ qidx_bucket_packed_t * buckets;
+ qidx_htable_t * htab;
+};
+
+size_t qidx_file_size(uint32_t n_buckets, uint32_t max_bucket_size) {
+ size_t sz = 0;
+
+ sz += sizeof(qidx_header_t);
+ sz += (uint64_t) sizeof(qidx_bucket_packed_t) * (uint64_t) n_buckets;
+ sz += (uint64_t) n_buckets * (uint64_t) max_bucket_size;
+
+ return sz;
+}
+
+// @abstract initialize htab, must first populate fp->buckets
+qidx_err_t _qidx_init_htab(qidx_fp_t * fp,
+ uint32_t n_buckets, uint32_t max_bucket_size) {
+ qidx_htable_t * htab = malloc(sizeof(qidx_htable_t));
+ if(!htab) {
+ return QIDX_NO_MEM;
+ }
+
+ htab->buckets = calloc(sizeof(qidx_bucket_t), n_buckets);
+ if(!htab->buckets) {
+ free(htab);
+ return QIDX_NO_MEM;
+ }
+
+ htab->n_buckets = n_buckets;
+ htab->max_bucket_size = max_bucket_size;
+
+ htab->rb_start = (void *) (fp->buckets + n_buckets);
+
+ fp->htab = htab;
+
+ return QIDX_OK;
+}
+
+
+qidx_err_t qidx_open(qidx_fp_t ** _fp, int fd) {
+ qidx_fp_t * fp = malloc(sizeof(qidx_fp_t));
+ *_fp = fp;
+
+ fp->fd = fd;
+
+ qidx_header_t hdr;
+ if(read(fd, &hdr, sizeof(qidx_header_t)) == -1) {
+ return QIDX_FAILED_TO_OPEN_INDEX_FILE;
+ }
+
+ if(le32toh(hdr.magic) != QIDX_MAGIC) {
+ return QIDX_INVALID_MAGIC;
+ }
+
+ if(le16toh(hdr.version) != QIDX_VERSION) {
+ return QIDX_INVALID_VERSION;
+ }
+
+ uint32_t max_bucket_size = le32toh(hdr.max_bucket_size);
+ uint32_t n_buckets = le32toh(hdr.n_buckets);
+
+ size_t sz = qidx_file_size(n_buckets, max_bucket_size);
+
+ if(lseek(fd, 0, SEEK_SET) == -1) {
+ return QIDX_IO_FAIL;
+ }
+
+ fp->read_only = true;
+ fp->map = mmap(NULL, sz, PROT_READ, MAP_SHARED, fd, 0);
+ fp->len = sz;
+
+ if(fp->map == MAP_FAILED) {
+ return QIDX_MAP_FAIL;
+ }
+
+ fp->hdr = fp->map;
+ fp->buckets = (qidx_bucket_packed_t *)(fp->hdr + 1);
+
+ qidx_err_t err = _qidx_init_htab(fp, n_buckets, max_bucket_size);
+ if(err) { return err; }
+
+ for(size_t i = 0; i < n_buckets; i++) {
+ fp->htab->buckets[i].n_bytes = le32toh(fp->buckets[i].n_bytes);
+ fp->htab->buckets[i].offset = le64toh(fp->buckets[i].offset);
+ }
+
+ return QIDX_OK;
+}
+
+qidx_err_t qidx_create(qidx_fp_t ** _fp, int fd, uint32_t _n_buckets, uint32_t max_bucket_size) {
+ qidx_fp_t * fp = malloc(sizeof(qidx_fp_t));
+ *_fp = fp;
+
+ uint32_t n_buckets = next_prime_above_lowerbound(_n_buckets);
+
+ size_t sz = qidx_file_size(n_buckets, max_bucket_size);
+
+ fp->fd = fd;
+
+ if(ftruncate(fd, sz) == -1) {
+ return QIDX_IO_FAIL;
+ }
+
+ fp->read_only = false;
+ fp->map = mmap(NULL, sz, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ fp->len = sz;
+
+ if(fp->map == MAP_FAILED) {
+ return QIDX_MAP_FAIL;
+ }
+
+ fp->hdr = fp->map;
+
+ fp->hdr->magic = htole32(QIDX_MAGIC);
+ fp->hdr->version = htole16(QIDX_VERSION);
+ fp->hdr->n_buckets = htole32(n_buckets);
+ fp->hdr->max_bucket_size = htole32(max_bucket_size);
+
+ fp->buckets = (qidx_bucket_packed_t *) (fp->hdr + 1);
+
+ _qidx_init_htab(fp, n_buckets, max_bucket_size);
+
+ uint64_t off = 0;
+ for(size_t i = 0; i < n_buckets; i++) {
+ fp->htab->buckets[i].offset = off;
+ fp->htab->buckets[i].n_bytes = 0;
+
+ off += max_bucket_size;
+ }
+
+ // REVISIT
+ for(size_t i = n_buckets - 1; i > 0; i--) {
+ fp->htab->rb_start[fp->htab->buckets[i].offset] = '\1';
+ }
+
+ return QIDX_OK;
+}
+
+qidx_err_t qidx_close(qidx_fp_t * fp) {
+
+ qidx_htable_t * htab = fp->htab;
+
+ if(!fp->read_only) {
+ for(size_t i = 0; i < htab->n_buckets; i++) {
+ fp->buckets[i].n_bytes = htole32(htab->buckets[i].n_bytes);
+ fp->buckets[i].offset = htole64(htab->buckets[i].offset);
+ }
+ }
+
+ free(htab->buckets);
+ free(htab);
+
+ if(munmap(fp->map, fp->len) != -1) {
+ return QIDX_MAP_FAIL;
+ }
+
+ free(fp);
+
+ return QIDX_OK;
+}
+
+qidx_htable_t * qidx_get_htable(qidx_fp_t * fp) { return fp->htab; }
+
+static qidx_bucket_t * _qidx_bucket(char const * qname, qidx_htable_t const * tab) {
+ size_t hash = murmur_hash_str_64s(qname, 0x12345);
+
+ qidx_bucket_t * bucket = &tab->buckets[hash % (size_t)(tab->n_buckets)];
+
+ return bucket;
+}
+
+qidx_err_t qidx_table_add(qidx_htable_t * tab, qidx_record_t const * rec) {
+ qidx_bucket_t * bucket = _qidx_bucket(rec->qname, tab);
+
+ void * ptr = (char *) tab->rb_start + bucket->offset + bucket->n_bytes;
+ size_t rem = tab->max_bucket_size - bucket->n_bytes;
+
+ ssize_t wrlen = qidx_record_marital(rec, ptr, rem);
+
+ if(wrlen < 0) {
+ return QIDX_BUCKET_MAX_SIZE_EXCEEDED;
+ }
+
+ bucket->n_bytes += wrlen;
+
+ return QIDX_OK;
+}
+
+qidx_err_t qidx_table_get(qidx_htable_t const * tab,
+ char const * qname, qidx_record_t ** rec) {
+ qidx_bucket_t * bucket = _qidx_bucket(qname, tab);
+
+ void const * bucket_start = tab->rb_start + bucket->offset;
+ void const * record_start =
+ qidx_record_martial_find(bucket_start, bucket->n_bytes, qname);
+
+ if(!record_start) {
+ *rec = NULL;
+ } else {
+ size_t rem = bucket->n_bytes -
+ ((char const *) record_start - (char const *) bucket_start);
+ *rec = malloc(sizeof(qidx_record_t));
+
+ if(!*rec) { return QIDX_NO_MEM; }
+
+ void const * next = qidx_record_unmartial(*rec, record_start, rem);
+
+ if(!next) {
+ return QIDX_IO_FAIL;
+ }
+ }
+
+ return QIDX_OK;
+}
+
+void qidx_record_print(FILE * file, qidx_record_t * rec) {
+ fprintf(file, "qname: %s\n", rec->qname);
+ for(size_t i = 0; i < rec->n_alns; i++) {
+ fprintf(file, "\tTID: %u\n", rec->alns[i].tid);
+ fprintf(file, "\tPOS: %u\n", rec->alns[i].pos);
+ fprintf(file, "\tVPTR: %lu\n", rec->alns[i].vptr);
+ }
+}
+
+qidx_err_t qidx_estimate_buckets(htsFile * fp,
+ double soft_limit, double z, uint32_t * n_buckets) {
+
+ // Pass 1:
+ // - estimate mu, s, n of the distribution of record sizes
+ // - estimate the number of buckets needed in the hash table
+ qidx_record_iter_t it;
+ qidx_err_t err;
+ if((err = qidx_record_iter_init(&it, fp))) {
+ return err;
+ }
+
+ roll_est_params_t params;
+ memset(&params, 0, sizeof(roll_est_params_t));
+ qidx_record_t rec;
+ while(!(err = qidx_record_iter_next(&it, &rec))) {
+ double sz = (double) qidx_record_msize(&rec);
+ roll_est_add_evidence(&params, sz);
+ }
+ if(err != QIDX_ITER_DONE) return err;
+
+ if ((err = qidx_record_iter_destroy(&it))) {
+ return err;
+ }
+
+ roll_est_finalize(&params);
+
+ *n_buckets = estimate_n_buckets(soft_limit, z, &params);
+
+ printf("record size (mean): %f\n", params.mu);
+ printf("record size (variance): %f\n", params.s);
+ printf("number of records: %zu\n", params.n);
+ printf("number of buckets: %u\n", *n_buckets);
+
+ return QIDX_OK;
+}
+
+qidx_err_t qidx_create_index3(htsFile * fp, int fd,
+ qidx_fp_t ** _q_fp, uint32_t n_buckets,
+ uint32_t max_bucket_size) {
+ qidx_err_t err;
+
+ if((err = qidx_create(_q_fp, fd, n_buckets, max_bucket_size))) {
+ return err;
+ }
+
+ qidx_fp_t * q_fp = *_q_fp;
+
+ qidx_htable_t * htab = qidx_get_htable(q_fp);
+
+ qidx_record_iter_t it;
+ if ((err = qidx_record_iter_init(&it, fp))) {
+ return err;
+ }
+
+ qidx_record_t rec;
+ while((err = qidx_record_iter_next(&it, &rec)) == QIDX_OK) {
+ qidx_table_add(htab, &rec);
+ }
+ if(err != QIDX_ITER_DONE) return err;
+
+ if ((err = qidx_record_iter_destroy(&it))) {
+ return err;
+ }
+
+ return QIDX_OK;
+}
+
+qidx_err_t qidx_create_index2(htsFile * fp, int fd, qidx_fp_t ** q_fp,
+ double soft_limit, double z, uint32_t n_buckets,
+ uint32_t max_bucket_size) {
+ qidx_err_t err;
+
+ if(n_buckets == 0) {
+
+ err = qidx_estimate_buckets(fp, soft_limit, z, &n_buckets);
+
+ if(err) {
+ return err;
+ }
+
+ // Rewind to start of BAM file
+ if (-1 == bgzf_seek(fp->fp.bgzf, 0, SEEK_SET)) {
+ return QIDX_IO_FAIL;
+ }
+
+ }
+
+ if((err = qidx_create_index3(fp, fd, q_fp, n_buckets, max_bucket_size))) {
+ return err;
+ }
+
+ return QIDX_OK;
+}
+
+qidx_err_t qidx_create_index1(char const * bamfile, char const * indexfile,
+ uint32_t n_buckets, uint32_t max_bucket_size) {
+ htsFile *b_fp = hts_open(bamfile, "r");
+ if(b_fp == NULL) { return QIDX_FAILED_TO_OPEN_BAMFILE; }
+
+ FILE * qidx_file = fopen(indexfile, "wb+");
+ if(!qidx_file) { return QIDX_FAILED_TO_OPEN_INDEX_FILE; }
+ int fd = fileno(qidx_file);
+
+ double soft_limit = 16 * 1024;
+ double z = 5;
+
+ qidx_fp_t * q_fp;
+ qidx_err_t err;
+ if((err = qidx_create_index2(b_fp, fd, &q_fp,
+ soft_limit, z, n_buckets, max_bucket_size))) {
+ return err;
+ }
+
+ qidx_close(q_fp);
+ fclose(qidx_file);
+ hts_close(b_fp);
+
+ return QIDX_OK;
+}
+
+qidx_err_t qidx_create_index(char const * bamfile, char const * indexfile) {
+ uint32_t max_bucket_size = 1 << 19;
+ return qidx_create_index1(bamfile, indexfile, 0, max_bucket_size);
+}
+
+qidx_err_t qidx_search_index(char const * indexfile,
+ char const * bamfile, char const * qname) {
+ FILE * index = fopen(indexfile, "rb");
+ if(!index) { return QIDX_FAILED_TO_OPEN_INDEX_FILE; }
+
+ int fd = fileno(index);
+
+ qidx_fp_t * q_fp;
+ qidx_err_t err = qidx_open(&q_fp, fd);
+ if(err) return err;
+
+ qidx_htable_t * htab = qidx_get_htable(q_fp);
+
+ qidx_record_t * rec;
+ if((err = qidx_table_get(htab, qname, &rec))) {
+ return err;
+ }
+
+ if(rec) {
+ htsFile * fp = hts_open(bamfile, "r");
+ htsFile* out_fp = hts_open("-", "w");
+
+
+ if(!fp) {
+ return QIDX_FAILED_TO_OPEN_BAMFILE;
+ }
+
+ bam1_t * b = bam_init1();
+ bam_hdr_t * hdr = sam_hdr_read(fp);
+ for(size_t i = 0; i < rec->n_alns; i++) {
+ aln_spec_t * aln = &rec->alns[i];
+
+ if(bgzf_seek(fp->fp.bgzf, aln->vptr, SEEK_SET) < 0) {
+ return QIDX_IO_FAIL;
+ }
+
+ int ret;
+ if((ret = sam_read1(fp, hdr, b)) >= 0) {
+
+ int ret = sam_write1(out_fp, hdr, b);
+ if(ret < 0) {
+ return QIDX_IO_FAIL;
+ }
+ }
+ }
+
+ sdsfree(rec->qname);
+ free(rec->alns);
+ free(rec);
+
+ bam_hdr_destroy(hdr);
+ bam_destroy1(b);
+ hts_close(out_fp);
+ hts_close(fp);
+ } else {
+ fprintf(stderr, "Could not find read: %s\n", qname);
+ }
+
+ if((err = qidx_close(q_fp))) {
+ return err;
+ }
+
+ return QIDX_OK;
+}
+
diff --git a/qidx.h b/qidx.h
new file mode 100644
index 0000000..fdc1046
--- /dev/null
+++ b/qidx.h
@@ -0,0 +1,70 @@
+#ifndef _QIDX
+#define _QIDX
+
+#include <htslib/hts.h>
+#include <stdbool.h>
+
+// @abstract Structure representing an unique alignment and virtual file pointer
+// @field tid chromosome ID
+// @field pos 0-based leftmost coordinate
+// @field vptr virtual pointer
+typedef struct {
+ uint32_t tid;
+ uint32_t pos;
+ uint64_t vptr;
+} aln_spec_t;
+
+// @abstract Structure representing all alignments by queryname
+// @field qname queryname (name for the given read)
+// @field n_alns number of alignments
+// @field alns array with alignments structures
+typedef struct {
+ char * qname;
+ uint16_t n_alns;
+ aln_spec_t * alns;
+} qidx_record_t;
+
+typedef enum {
+ QIDX_OK = 0,
+ QIDX_ITER_DONE,
+ QIDX_NO_MEM,
+ QIDX_NO_BAM_HDR,
+ QIDX_BAM_READ_FAILURE,
+ QIDX_REC_ITER_FAILURE,
+ QIDX_NOT_SORTED,
+ QIDX_BUCKET_MAX_SIZE_EXCEEDED,
+ QIDX_MAP_FAIL,
+ QIDX_IO_FAIL,
+ QIDX_FAILED_TO_OPEN_BAMFILE,
+ QIDX_FAILED_TO_OPEN_INDEX_FILE,
+ QIDX_INVALID_VERSION,
+ QIDX_INVALID_MAGIC,
+} qidx_err_t;
+
+bool qidx_errno(qidx_err_t err);
+
+char const * qidx_strerr(qidx_err_t err);
+
+#define QIDX_MAGIC 0x5d1de6b4
+#define QIDX_VERSION 1
+
+struct qidx_fp;
+typedef struct qidx_fp qidx_fp_t;
+
+qidx_err_t qidx_open(qidx_fp_t ** fp, int fd);
+qidx_err_t qidx_create(qidx_fp_t ** fp, int fd, uint32_t _n_buckets,
+ uint32_t max_bucket_size);
+qidx_err_t qidx_close(qidx_fp_t * fp);
+
+qidx_err_t qidx_create_index3(htsFile * fp, int fd, qidx_fp_t ** q_fp,
+ uint32_t n_buckets, uint32_t max_bucket_size);
+qidx_err_t qidx_create_index2(htsFile * fp, int fd, qidx_fp_t ** q_fp,
+ double soft_limit, double z, uint32_t n_buckets, uint32_t max_bucket_size);
+qidx_err_t qidx_create_index1(char const * bamfile, char const * indexfile, uint32_t n_buckets, uint32_t max_bucket_size);
+qidx_err_t qidx_create_index(char const * bamfile, char const * indexfile);
+
+// DO NOT USE
+qidx_err_t qidx_search_index(char const * indexfile,
+ char const * bamfile, char const * qname);
+
+#endif /* ifndef _QIDX */