void
checkUnitigMembership(UnitigVector &unitigs) {
uint32 *inUnitig = new uint32 [FI->numFragments()+1];
uint32 noUnitig = 0xffffffff;
// All reads start of not placed in a unitig.
for (uint32 i=0; i<FI->numFragments()+1; i++)
inUnitig[i] = noUnitig;
// Over all unitigs, remember where each read is.
for (uint32 ti=0; ti<unitigs.size(); ti++) {
Unitig *tig = unitigs[ti];
int32 len = 0;
if (tig == NULL)
continue;
for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
ufNode *frg = &tig->ufpath[fi];
if (frg->ident > FI->numFragments())
fprintf(stderr, "tig %u ufpath[%d] ident %u more than number of reads %u\n",
tig->id(), fi, frg->ident, FI->numFragments());
if (inUnitig[frg->ident] != noUnitig)
fprintf(stderr, "tig %u ufpath[%d] ident %u placed multiple times\n",
tig->id(), fi, frg->ident);
assert(frg->ident <= FI->numFragments()); // Can't be out of range.
assert(inUnitig[frg->ident] == noUnitig); // Read must be not placed yet.
inUnitig[frg->ident] = ti;
}
}
// Find any read not placed in a unitig.
for (uint32 i=0; i<FI->numFragments()+1; i++) {
if (FI->fragmentLength(i) == 0) // Deleted read.
continue;
assert(inUnitig[i] != 0); // There shouldn't be a unitig 0.
assert(inUnitig[i] != noUnitig); // The read should be in a unitig.
}
delete [] inUnitig;
}
static
void
makeNewUnitig(UnitigVector &unitigs,
uint32 splitFragsLen,
ufNode *splitFrags) {
Unitig *dangler = unitigs.newUnitig(false);
if (logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS))
writeLog("splitDiscontinuous()-- new tig "F_U32" with "F_U32" fragments (starting at frag "F_U32").\n",
dangler->id(), splitFragsLen, splitFrags[0].ident);
int splitOffset = -MIN(splitFrags[0].position.bgn, splitFrags[0].position.end);
// This should already be true, but we force it still
splitFrags[0].contained = 0;
for (uint32 i=0; i<splitFragsLen; i++)
dangler->addFrag(splitFrags[i], splitOffset, false); //logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS));
}
// After splitting and ejecting some contains, check for discontinuous unitigs.
//
void splitDiscontinuousUnitigs(UnitigVector &unitigs, uint32 minOverlap) {
writeLog("==> SPLIT DISCONTINUOUS\n");
uint32 numTested = 0;
uint32 numSplit = 0;
uint32 numCreated = 0;
uint32 splitFragsLen = 0;
uint32 splitFragsMax = 0;
ufNode *splitFrags = NULL;
for (uint32 ti=0; ti<unitigs.size(); ti++) {
Unitig *tig = unitigs[ti];
if ((tig == NULL) || (tig->ufpath.size() < 2))
continue;
// Unitig must be sorted. Someone upstream os screwing this up.
tig->sort();
// We'll want to build an array of new fragments to split out. This can be up
// to the size of the largest unitig.
splitFragsMax = MAX(splitFragsMax, tig->ufpath.size());
// Check that the unitig starts at position zero. Not critical for the next loop, but
// needs to be dome sometime.
int32 minPos = MIN(tig->ufpath[0].position.bgn, tig->ufpath[0].position.end);
if (minPos == 0)
continue;
writeLog("splitDiscontinuous()-- tig "F_U32" offset messed up; reset by "F_S32".\n", tig->id(), minPos);
for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
ufNode *frg = &tig->ufpath[fi];
frg->position.bgn -= minPos;
frg->position.end -= minPos;
}
}
splitFrags = new ufNode [splitFragsMax];
// Now, finally, we can check for gaps in unitigs.
for (uint32 ti=0; ti<unitigs.size(); ti++) {
Unitig *tig = unitigs[ti];
if ((tig == NULL) || (tig->ufpath.size() < 2))
continue;
// We don't expect many unitigs to be broken, so we'll do a first quick pass to just
// test if it is.
int32 maxEnd = MAX(tig->ufpath[0].position.bgn, tig->ufpath[0].position.end);
bool isBroken = false;
for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
ufNode *frg = &tig->ufpath[fi];
int32 bgn = MIN(frg->position.bgn, frg->position.end);
int32 end = MAX(frg->position.bgn, frg->position.end);
if (bgn > maxEnd - minOverlap) {
isBroken = true;
break;
}
maxEnd = MAX(maxEnd, end);
}
numTested++;
if (isBroken == false)
continue;
numSplit++;
// Dang, busted unitig. Fix it up.
splitFragsLen = 0;
maxEnd = 0;
if (logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS))
writeLog("splitDiscontinuous()-- discontinuous tig "F_U32" with "F_SIZE_T" fragments broken into:\n",
tig->id(), tig->ufpath.size());
for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
ufNode *frg = &tig->ufpath[fi];
int32 bgn = MIN(frg->position.bgn, frg->position.end);
int32 end = MAX(frg->position.bgn, frg->position.end);
// Good thick overlap exists to this fragment, save it.
if (bgn <= maxEnd - minOverlap) {
assert(splitFragsLen < splitFragsMax);
splitFrags[splitFragsLen++] = *frg;
//.........这里部分代码省略.........
void
popBubbles(UnitigVector &unitigs,
double deviationBubble) {
BubTargetList potentialBubbles;
findPotentialBubbles(unitigs, potentialBubbles);
writeStatus("popBubbles()-- Found "F_SIZE_T" potential bubbles.\n", potentialBubbles.size());
//if (potentialBubbles.size() == 0)
// return;
writeLog("\n");
writeLog("Found "F_SIZE_T" potential bubbles.\n", potentialBubbles.size());
writeLog("\n");
vector<overlapPlacement> *placed = findBubbleReadPlacements(unitigs, potentialBubbles, deviationBubble);
// We now have, in 'placed', a list of all the places that each read could be placed. Decide if there is a _single_
// place for each bubble to be popped.
uint32 tiLimit = unitigs.size();
//uint32 tiNumThreads = omp_get_max_threads();
//uint32 tiBlockSize = (tiLimit < 100000 * tiNumThreads) ? tiNumThreads : tiLimit / 99999;
// Clear flags.
for (uint32 ti=0; ti<tiLimit; ti++) {
if (unitigs[ti]) {
unitigs[ti]->_isBubble = false;
unitigs[ti]->_isRepeat = false;
}
}
// In parallel, process the placements.
for (uint32 ti=0; ti<tiLimit; ti++) {
if (potentialBubbles.count(ti) == 0) // Not a potential bubble
continue;
// Scan the bubble, decide if there are _ANY_ read placements. Log appropriately.
Unitig *bubble = unitigs[ti];
bool hasPlacements = false;
for (uint32 fi=0; fi<bubble->ufpath.size(); fi++) {
uint32 readID = bubble->ufpath[fi].ident;
if (placed[readID].size() > 0)
hasPlacements = true;
}
if (hasPlacements == false)
writeLog("potential bubble %u had no valid placements (all were not contained in target tig)\n", ti);
else
writeLog("potential bubble %u\n", ti);
// Split the placements into piles for each target and build an interval list for each target.
// For each read in the tig, convert the vector of placements into interval lists, one list per target tig.
map<uint32, intervalList<uint32> *> targetIntervals;
for (uint32 fi=0; fi<bubble->ufpath.size(); fi++) {
uint32 readID = bubble->ufpath[fi].ident;
for (uint32 pp=0; pp<placed[readID].size(); pp++) {
uint32 tid = placed[readID][pp].tigID;
assert(placed[readID][pp].frgID > 0);
uint32 bgn = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.bgn : placed[readID][pp].position.end;
uint32 end = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.end : placed[readID][pp].position.bgn;
if (targetIntervals[tid] == NULL)
targetIntervals[tid] = new intervalList<uint32>;
//writeLog("read %u -> tig %u intervals %u-%u\n", readID, tid, bgn, end);
targetIntervals[tid]->add(bgn, end-bgn);
}
}
vector<candidatePop *> targets;
// Squish the intervals. Create new candidatePops for each interval that isn't too big or
// small. Assign each overlapPlacements to the correct candidatePop.
for (map<uint32, intervalList<uint32> *>::iterator it=targetIntervals.begin(); it != targetIntervals.end(); ++it) {
uint32 targetID = it->first;
intervalList<uint32> *IL = it->second;
IL->merge();
// Discard intervals that are significantly too small or large. Save the ones that are
// nicely sized. Logging here isn't terribly useful, it's just repeated (out of order) later
// when we try to make sense of the read alignments.
for (uint32 ii=0; ii<IL->numberOfIntervals(); ii++) {
if ((IL->hi(ii) - IL->lo(ii) < 0.75 * bubble->getLength()) || // Too small!
(1.25 * bubble->getLength() < IL->hi(ii) - IL->lo(ii))) { // Too big!
//.........这里部分代码省略.........
void
findPotentialBubbles(UnitigVector &unitigs,
BubTargetList &potentialBubbles) {
uint32 tiLimit = unitigs.size();
uint32 tiNumThreads = omp_get_max_threads();
uint32 tiBlockSize = (tiLimit < 100000 * tiNumThreads) ? tiNumThreads : tiLimit / 99999;
writeStatus("\n");
writeStatus("bubbleDetect()-- working on "F_U32" unitigs, with "F_U32" threads.\n", tiLimit, tiNumThreads);
for (uint32 ti=0; ti<tiLimit; ti++) {
Unitig *tig = unitigs[ti];
if ((tig == NULL) || // Not a tig, ignore it.
(tig->ufpath.size() == 1)) // Singleton, handled elsewhere.
continue;
uint32 nonContainedReads = 0;
bool validBubble = true;
map<uint32,uint32> tigOlapsTo;
uint32 fiLimit = tig->ufpath.size();
uint32 fiNumThreads = omp_get_max_threads();
uint32 fiBlockSize = (fiLimit < 100 * fiNumThreads) ? fiNumThreads : fiLimit / 99;
for (uint32 fi=0; (validBubble == true) && (fi<fiLimit); fi++) {
uint32 rid = tig->ufpath[fi].ident;
if (OG->isContained(rid) == true) // Don't need to check contained reads. If their container
continue; // passes the tests below, the contained read will too.
nonContainedReads++;
uint32 ovlLen = 0;
BAToverlap *ovl = OC->getOverlaps(rid, AS_MAX_ERATE, ovlLen);
set<uint32> readOlapsTo;
for (uint32 oi=0; oi<ovlLen; oi++) {
uint32 ovlTigID = Unitig::fragIn(ovl[oi].b_iid);
Unitig *ovlTig = unitigs[ovlTigID];
// Skip this overlap if it is to an unplaced read, to a singleton tig, to ourself,
// or to a unitig that is shorter than us. We can not pop this tig as a bubble
// in any of those cases.
if ((ovlTigID == 0) ||
(ovlTig == NULL) ||
(ovlTig->ufpath.size() == 1) ||
(ovlTig->id() == tig->id()) ||
(ovlTig->getLength() < tig->getLength()))
continue;
// Otherwise, remember that we had an overlap to ovlTig.
//writeLog("tig %u read %u overlap to tig %u read %u\n",
// tig->id(), rid, ovlTigID, ovl[oi].b_iid);
readOlapsTo.insert(ovlTigID);
}
//writeLog("tig %8u read %8u has %u olaps\n", tig->id(), rid, readOlapsTo.size());
// Transfer the per-read counts to the per-unitig counts: add one to the counter for each tig
// that we have overlaps to.
for (set<uint32>::iterator it=readOlapsTo.begin(); it != readOlapsTo.end(); ++it)
tigOlapsTo[*it]++;
// Decide if we're a valid potential bubble. If tig id (in it->first) has overlaps to every
// read we've seen so far (nonContainedReads), we're still a valid bubble.
//
// To _attempt_ to have differences in the bubble, we'll accept it if 3/4 of the reads
// have overlaps.
validBubble = false;
for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it)
if (it->second >= BUBBLE_READ_FRACTION * nonContainedReads)
validBubble = true;
// If we've not seen that many reads, pretend it's a valid bubble. It'll get screened out later.
if (nonContainedReads < 16)
validBubble = true;
}
// If not validBubble, report.
#if 0
if (validBubble == false) {
writeLog("notValidBubble tig %8d expects %6u reads\n", tig->id(), nonContainedReads);
for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it)
writeLog(" to tig %8u overlaps %6u\n", it->first, it->second);
}
#endif
// If validBubble, then there is a tig that every dovetail read has at least one overlap to.
//.........这里部分代码省略.........
intersectionList::intersectionList(UnitigVector &unitigs) {
for (uint32 ti=0; ti<unitigs.size(); ti++) {
Unitig *tig = unitigs[ti];
if (tig == NULL)
continue;
intersectionEvidence *evidence = new intersectionEvidence [tig->ufpath.size()];
for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
ufNode *frg = &tig->ufpath[fi];
if (OG->isContained(frg->ident))
continue;
// For my best overlap, the ID of the unitig that the overlapping fragment is in.
evidence[fi].edge5 = *OG->getBestEdgeOverlap(frg->ident, false);
evidence[fi].edge3 = *OG->getBestEdgeOverlap(frg->ident, true);
evidence[fi].frag5tig = tig->fragIn(evidence[fi].edge5.fragId());
evidence[fi].frag3tig = tig->fragIn(evidence[fi].edge3.fragId());
// Do NOT initialize these! An earlier fragment could have already confirmed an end.
// Properly, only the 5' end of a forward fragment (or 3' end of a reverse fragment) can be
// confirmed already (otherwise the tig is nonsense), but we don't yet check that.
//
//evidence[fi].frag5confirmed = false;
//evidence[fi].frag3confirmed = false;
// But, because the path could be promiscuous, not every overlap to a different tig is bad.
//
// If my best overlap is to a different tig, but there is an overlapping fragment (in the
// unitig placement) with a best edge to me, I'm still good. The BOG build this unitig using
// the edge from the other fragment to me.
//
// If the fragments do not overlap in the layout (yet the best edge still exists) that is a
// self-intersection.
//
// The two blocks are identical, except for 'edge3' and 'edge5'.
if (evidence[fi].frag5tig == tig->id()) {
uint32 ti = tig->pathPosition(evidence[fi].edge5.fragId());
ufNode *trg = &tig->ufpath[ti];
uint32 minf = (frg->position.bgn < frg->position.end) ? frg->position.bgn : frg->position.end;
uint32 maxf = (frg->position.bgn < frg->position.end) ? frg->position.end : frg->position.bgn;
uint32 mint = (trg->position.bgn < trg->position.end) ? trg->position.bgn : trg->position.end;
uint32 maxt = (trg->position.bgn < trg->position.end) ? trg->position.end : trg->position.bgn;
// If they overlap, mark as confirmed, else remember an intersection.
if (((minf < mint) && (mint < maxf)) || // t begins inside f
((mint < minf) && (minf < maxt))) { // f begins inside t
if (evidence[fi].edge5.frag3p())
evidence[ti].frag3confirmed = true;
else
evidence[ti].frag5confirmed = true;
} else {
evidence[fi].frag5self = true;
// Not the correct place to report this. Some of these get confirmed by later fragments.
//writeLog("BUG1 F: %d,%d T %d,%d\n", minf, maxf, mint, maxt);
//writeLog("INTERSECT from unitig %d frag %d end %d TO unitig %d frag %d end %d (SELF)\n",
// tig->id(), frg->ident, 5, evidence[fi].frag5tig, evidence[fi].edge5.fragId(), evidence[fi].edge5.frag3p() ? 3 : 5);
}
}
if (evidence[fi].frag3tig == tig->id()) {
uint32 ti = tig->pathPosition(evidence[fi].edge3.fragId());
ufNode *trg = &tig->ufpath[ti];
uint32 minf = (frg->position.bgn < frg->position.end) ? frg->position.bgn : frg->position.end;
uint32 maxf = (frg->position.bgn < frg->position.end) ? frg->position.end : frg->position.bgn;
uint32 mint = (trg->position.bgn < trg->position.end) ? trg->position.bgn : trg->position.end;
uint32 maxt = (trg->position.bgn < trg->position.end) ? trg->position.end : trg->position.bgn;
if (((minf < mint) && (mint < maxf)) || // t begins inside f
((mint < minf) && (minf < maxt))) { // f begins inside t
if (evidence[fi].edge3.frag3p())
evidence[ti].frag3confirmed = true;
else
evidence[ti].frag5confirmed = true;
} else {
evidence[fi].frag3self = true;
// Not the correct place to report this. Some of these get confirmed by later fragments.
//writeLog("BUG2 F: %d,%d T %d,%d\n", minf, maxf, mint, maxt);
//writeLog("INTERSECT from unitig %d frag %d end %d TO unitig %d frag %d end %d (SELF)\n",
// tig->id(), frg->ident, 3, evidence[fi].frag3tig, evidence[fi].edge3.fragId(), evidence[fi].edge3.frag3p() ? 3 : 5);
}
}
}
//.........这里部分代码省略.........
static
void
joinUnitigs_append(UnitigVector &unitigs, joinEntry *join) {
uint32 frId = Unitig::fragIn(join->frFragID);
uint32 toId = Unitig::fragIn(join->toFragID);
Unitig *fr = unitigs[frId];
Unitig *to = unitigs[toId];
uint32 frIdx = Unitig::pathPosition(join->frFragID);
uint32 toIdx = Unitig::pathPosition(join->toFragID);
// The 'fr' unitig is assumed to be forward, and assumed to be the one we join to.
// Compute the offset for our append. We just need to compute where the join fragment would
// appear in the unitig. The join fragment MUST be the first thing in the frUnitig.
//int32 offset = MIN(frF.position.bgn, frF.position.end);
// Over all fragments in the frUnitig, add them to either the joinUnitig or the discUnitig.
Unitig *joinUnitig = unitigs.newUnitig(false);
Unitig *discUnitig = unitigs.newUnitig(false);
// Reverse the 'to' unitig if needed.
if (join->toFlip)
to->reverseComplement(true);
// If we're joining off the 5' end of the fr untiig, add the to reads first.
if (join->frFirst == true) {
uint32 ii=0;
for (; ii < toIdx; ii++)
joinUnitig->addFrag(to->ufpath[ii], 0, false);
for (; ii < to->ufpath.size(); ii++)
discUnitig->addFrag(to->ufpath[ii], 0, false);
}
// Now add all the fr unitig reads.
for (uint32 ii=0; ii < fr->ufpath.size(); ii++)
joinUnitig->addFrag(to->ufpath[ii], 0, false);
// If we're not joining off the 5' end, add the to unitig reads last.
if (join->frFirst == false) {
uint32 ii = 0;
for (; ii < toIdx; ii++)
discUnitig->addFrag(to->ufpath[ii], 0, false);
for (; ii < to->ufpath.size(); ii++)
joinUnitig->addFrag(to->ufpath[ii], 0, false);
}
// Delete the donor unitigs.
delete fr;
delete to;
unitigs[frId] = NULL;
unitigs[toId] = NULL;
// And make sure the new unitigs are consistent.
joinUnitig->sort();
discUnitig->sort();
}
请发表评论