GATK(Genome Analysis Toolkit)是由 Broad Institute 开发的一个用于处理高通量测序数据的工具套件,广泛用于变异检测和基因组分析。IntervalLocusIterator 和 ShardedIntervalIterator 是 GATK 中的两个迭代器类,主要用于在指定的基因组区域(intervals)内遍历基因组位置,这使得在大规模数据处理时可以更有效地访问和操作特定的基因组区域。
ShardedIntervalIterator 类接收一个 SimpleInterval 迭代器,并把其中的每个区间按给定的 shard 大小(以碱基为单位)依次切分成若干个小区间;IntervalLocusIterator 类则对每个区间调用 shard 大小为 1 的 ShardedIntervalIterator,
从而逐个碱基位置地遍历多个 intervals。
ShardedIntervalIterator源代码:
package org.broadinstitute.hellbender.utils.iterators;
import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import java.util.Iterator;
import java.util.NoSuchElementException;
/**
 * Iterator that breaks up each input interval into consecutive shards of at most {@code shardSize} bases.
 * Only one shard is materialized at a time, so this iterator is not RAM intensive.
 *
 * An empty iterator of intervals is supported; {@link #hasNext()} is then false from the start.
 *
 * Developer note: the next shard to return is always queued up in {@code shardedInterval}. A {@code null}
 * value there means the iterator is exhausted, which also covers the case where an empty iterator was
 * passed in for initialization.
 */
public class ShardedIntervalIterator implements Iterator<SimpleInterval> {
    /** Source of the intervals to be sharded. */
    private final Iterator<SimpleInterval> intervals;

    /** Maximum number of bases in each shard. */
    private final int shardSize;

    /**
     * The input interval that we are currently sharding, or {@code null} once the input is exhausted.
     */
    private SimpleInterval currentInterval;

    /** Index of the shard (within the current interval) that is queued up in {@code shardedInterval}. */
    private int currentOffsetInCurrentInterval;

    /** Index of the last shard within the current interval. */
    private int lastOffsetInCurrentInterval;

    /** The shard that the next call to {@link #next()} will return, or {@code null} if there is none. */
    private SimpleInterval shardedInterval;

    /**
     * @param intervals        the intervals to break up into shards; must not be null (may be empty)
     * @param shardSizeInBases maximum size of each shard, in bases; must be greater than zero
     */
    public ShardedIntervalIterator(Iterator<SimpleInterval> intervals, int shardSizeInBases) {
        Utils.validate(shardSizeInBases > 0, "Invalid shard size. Must be greater than zero.");
        // Fail fast with a clear message instead of a bare NullPointerException inside advanceInterval().
        Utils.nonNull(intervals, "Input iterator cannot be null");
        this.intervals = intervals;
        this.shardSize = shardSizeInBases;

        // Queue up the first interval to shard, or leave everything null/zero if the input is empty.
        advanceInterval();
    }

    /**
     * Moves on to the next input interval and queues up its first shard, or clears the queued state when
     * there are no more input intervals.
     *
     * This should only be called when the shards within an interval have been exhausted, i.e. when starting
     * a new interval to be sharded.
     */
    private void advanceInterval() {
        if (!intervals.hasNext()) {
            lastOffsetInCurrentInterval = 0;
            currentInterval = null;
            shardedInterval = null;
            return;
        }
        currentInterval = intervals.next();
        // Shard index of the first base of the interval (offsets passed to shardIndex are one-based).
        currentOffsetInCurrentInterval = IntervalUtils.shardIndex(1, shardSize);
        // Shard index of the last base of the interval.
        lastOffsetInCurrentInterval = IntervalUtils.shardIndex(currentInterval.size(), shardSize);
        shardedInterval = calculateShardedInterval();
    }

    /**
     * Builds the shard at {@code currentOffsetInCurrentInterval} within {@code currentInterval}, clipping
     * the shard end to the end of the interval.
     *
     * @return the shard as a new interval on the same contig as the current interval
     */
    private SimpleInterval calculateShardedInterval() {
        final int intervalStart = currentInterval.getStart();
        final int shardStart =
                intervalStart + IntervalUtils.beginOfShard(currentOffsetInCurrentInterval, shardSize) - 1;

        // The unclipped end can exceed Integer.MAX_VALUE when the shard size is large. Do the addition in
        // long so it cannot wrap to a negative number before being clipped to the interval end.
        final long unclippedShardEnd =
                (long) intervalStart + IntervalUtils.endOfShard(currentOffsetInCurrentInterval, shardSize) - 1;
        // Safe narrowing: the minimum is never larger than the (int) interval end.
        final int shardEnd = (int) Math.min(unclippedShardEnd, currentInterval.getEnd());

        return new SimpleInterval(currentInterval.getContig(), shardStart, shardEnd);
    }

    /**
     * @return true if another shard is queued up; false once the last shard of the last interval has been
     *         returned (or if the input was empty)
     */
    @Override
    public boolean hasNext() {
        return shardedInterval != null;
    }

    /**
     * Returns the queued shard and queues up the following one.
     *
     * @return the next shard
     * @throws NoSuchElementException if there are no more shards
     */
    @Override
    public SimpleInterval next() {
        if (shardedInterval == null) {
            throw new NoSuchElementException();
        }
        final SimpleInterval result = shardedInterval;

        // Advance the shard index and (if necessary) move on to the next interval.
        advanceShardInInterval();
        return result;
    }

    /**
     * Steps to the next shard of the current interval. This can also advance the interval if the current
     * one has no shards left.
     */
    private void advanceShardInInterval() {
        currentOffsetInCurrentInterval++;
        if (currentOffsetInCurrentInterval > lastOffsetInCurrentInterval) {
            // advanceInterval() queues up the first shard of the next interval (or null) by itself.
            advanceInterval();
        } else {
            shardedInterval = calculateShardedInterval();
        }
    }
}
IntervalLocusIterator源代码:
package org.broadinstitute.hellbender.utils.iterators;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
/**
 * Walks a set of intervals one locus at a time: each call to {@link #next()} yields a SimpleInterval
 * for a single genomic point location, covering every interval of the input in order.
 */
public class IntervalLocusIterator implements Iterator<SimpleInterval> {
    /** Source of the intervals whose loci are to be visited. */
    private final Iterator<SimpleInterval> intervalIterator;

    /** The interval currently being walked, or null when the input had nothing to offer. */
    public SimpleInterval currentInterval = null;

    /** Iterates over the current interval, each location encoded as an interval of size 1. */
    private Iterator<SimpleInterval> baseLocationIterator;

    /**
     * @param intervalIterator the intervals to traverse locus by locus; must not be null
     */
    public IntervalLocusIterator(final Iterator<SimpleInterval> intervalIterator) {
        Utils.nonNull(intervalIterator, "Input iterator cannot be null");
        this.intervalIterator = intervalIterator;
        advanceCurrentInterval();
    }

    /**
     * @return true while the current interval still has loci left or another interval is waiting
     */
    @Override
    public boolean hasNext() {
        return currentInterval != null
                && (baseLocationIterator.hasNext() || intervalIterator.hasNext());
    }

    /**
     * @return the next locus, moving on to the next interval when the current one is used up
     * @throws NoSuchElementException if neither the current interval nor the input has anything left
     */
    @Override
    public SimpleInterval next() {
        if (!baseLocationIterator.hasNext()) {
            if (!intervalIterator.hasNext()) {
                throw new NoSuchElementException();
            }
            advanceCurrentInterval();
        }
        return baseLocationIterator.next();
    }

    /**
     * Fetches the next interval (if there is one) and rebuilds the per-locus iterator for it.
     */
    private void advanceCurrentInterval() {
        // Typically, the null case should only be hit when this class is
        // initialized with an empty interval list.
        currentInterval = intervalIterator.hasNext() ? intervalIterator.next() : null;
        baseLocationIterator = createBaseLocationIterator(currentInterval);
    }

    /**
     * @param fullInterval the interval to walk, or null for none
     * @return an iterator over shards of size 1 of the given interval; an empty iterator for null
     */
    private Iterator<SimpleInterval> createBaseLocationIterator(final SimpleInterval fullInterval) {
        return fullInterval == null
                ? Collections.<SimpleInterval>emptyIterator()
                : new ShardedIntervalIterator(Collections.singletonList(fullInterval).iterator(), 1);
    }
}