Use of sparse vector for compressed DNA strings (2 bits per bp) Construction of comparison functions on bit-transposed compressed containers, benchmarking.
#include <assert.h>
#include <stdlib.h>
#include <iostream>
#include <vector>
#include <algorithm>
#include <utility>
#include "bmdbg.h"
using namespace std;
inline unsigned DNA2int(
char DNA_bp)
{
switch (DNA_bp)
{
case 'A':
return 0;
case 'T':
return 1;
case 'G':
return 2;
case 'C':
return 3;
case 'N':
return 4;
case '$':
return 5;
default:
assert(0);
return 0;
}
}
{
static char lut[] = { 'A', 'T', 'G', 'C', 'N', '$' };
if (code < 5)
return lut[code];
assert(0);
return 'N';
}
template<
typename SV>
void PrintSV(
const SV& sv)
{
cout << "size() = " << sv.size() << " : ";
auto it = sv.begin(); auto it_end = sv.end();
for (; it != it_end; ++it)
{
auto v = *it;
cout << bp;
}
cout << endl;
}
template<typename SV, typename VECT>
void TestSV(
const SV& sv,
const VECT& vect)
{
auto it = sv.begin(); auto it_end = sv.end();
auto itv = vect.begin(); auto itv_end = vect.end();
for (; it != it_end && itv != itv_end; ++it, ++itv)
{
auto v = *it;
char bpv = *itv;
assert(bp == bpv);
if (bp != bpv)
{
cerr << "Error: reference vector mismatch!" << endl;
exit(1);
}
}
if (it != it_end || itv != itv_end)
{
cerr << "Error: reference size mismatch!" << endl;
exit(1);
}
}
static
{
if (found)
{
auto v1 = sv1[pos];
auto v2 = sv2[pos];
if (v1 < v2)
return -1;
return 1;
}
return 0;
}
static
{
{
auto v1 = *it1; auto v2 = *it2;
if (v1 == v2)
continue;
if (v1 < v2)
return -1;
return 1;
}
return 0;
}
static
{
for (unsigned i = 0; i < sz; ++i)
{
unsigned code = rand() % 4;
assert(bp == 'A' || bp == 'T' || bp == 'G' || bp == 'C');
vect.push_back(bp);
bi = code;
}
bi.flush();
}
static
{
assert(csz);
assert(csz < sz);
{
vect[i] = 'N';
}
}
static
{
const unsigned case_size = 200000000;
const unsigned centr_size = 7000000;
}
static
unsigned m_count)
{
vect_m.resize(0);
if (!vect.size())
return;
vector_char_type::size_type sz = vect.size();
vector_char_type::size_type delta = sz / m_count;
for (vector_char_type::size_type i = 0; i < sz; i += delta)
{
vector_char_type::value_type v1 = vect[i];
unsigned code = rand() % 4;
vector_char_type::value_type v2 =
int2DNA(code);
if (v2 == v1)
continue;
vect_m.push_back(vector_pairs_type::value_type(i, code));
}
for (vector_char_type::size_type i = 1; i < sz / 4; i += (rand()%(1024 * 10)))
{
vector_char_type::value_type v1 = vect[i];
unsigned code = rand() % 4;
vector_char_type::value_type v2 =
int2DNA(code);
if (v2 == v1)
continue;
vect_m.push_back(vector_pairs_type::value_type(i, code));
}
}
static
{
unsigned cnt = 0;
for (unsigned k = 0; k < sz; ++k)
{
unsigned i = vect_m[k].first;
assert(v1 != v2);
cnt += found;
assert(found);
assert(pos == i);
}
assert(cnt == vect_m.size());
}
static
{
unsigned sz = (unsigned)vect_m.size();
unsigned cnt = 0;
for (unsigned k = 0; k < sz; ++k)
{
unsigned i = vect_m[k].first;
vector_char_type::value_type v1 = vect[i];
vector_char_type::value_type v2 =
int2DNA(vect_m[k].second);
assert(v1 != v2);
vect1[i] = v2;
bool found = false;
vector_char_type::size_type pos;
auto it = vect.begin(); auto it_end = vect.end();
auto it1 = vect1.begin();
for (; it != it_end; ++it, ++it1)
{
if (*it != *it1)
{
found = true;
pos = vector_char_type::size_type(it - vect.begin());
break;
}
}
cnt += found;
assert(found);
assert(pos == i);
vect1[i] = v1;
}
assert(cnt == vect_m.size());
}
static
{
unsigned sz = (unsigned)vect_m.size();
unsigned cnt = 0;
unsigned vs = unsigned(vect.size());
for (unsigned k = 0; k < sz; ++k)
{
unsigned i = vect_m[k].first;
vector_char_type::value_type v1 = vect[i];
assert(vect_m[k].second < 4);
vector_char_type::value_type v2 =
int2DNA(vect_m[k].second);
assert(v1 != v2);
vect1[i] = v2;
const char* s1 = vect.data();
const char* s2 = vect1.data();
int res = ::strncmp(s1, s2, vs);
cnt += bool(res);
assert(res);
vect1[i] = v1;
}
assert(cnt == vect_m.size());
}
static
{
unsigned sz = (unsigned)vect_m.size();
unsigned cnt = 0;
for (unsigned k = 0; k < sz; ++k)
{
unsigned i = vect_m[k].first;
assert(v1 != v2);
assert(res != 0);
cnt += bool(res);
}
assert(cnt == vect_m.size());
}
inline
{
unsigned sz = (unsigned)vect_m.size();
unsigned cnt = 0;
for (unsigned k = 0; k < sz; ++k)
{
unsigned i = vect_m[k].first;
assert(v1 != v2);
cnt += bool(res);
assert(res != 0);
}
assert(cnt == vect_m.size());
}
{
try
{
{
const char* s = "ATGTCNNNNNTATA";
{
for (unsigned i = 0; s[i]; ++i)
{
}
bi.flush();
}
}
{
cout << "Generate benchmark test data." << endl;
cout << "generated mismatches=" << vect_m.size() << endl;
cout << "SV mismatch search test" << endl;
cout << "STL interator mismatch test" << endl;
cout << "strncmp() test" << endl;
cout << "SV compare test" << endl;
#if 0
cout << "SV compare iterators test" << endl;
#endif
}
std::cout << std::endl << "Performance:" << std::endl;
}
catch(std::exception& ex)
{
std::cerr << ex.what() << std::endl;
return 1;
}
return 0;
}