commit 6eb4dd815927cb9bd913a744c2a9574360cd45ec
parent 87e8f988a6179fc6764a97c86f0053f538b4f006
Author: Walther Chen <walther.chen@gmail.com>
Date: Fri, 1 Nov 2024 09:22:17 -0400
ba1j, ba1l: frequent_words_with_mismatch, neighbors
Diffstat:
A | ba1j.c3 | | | 26 | ++++++++++++++++++++++++++ |
A | ba1l.c3 | | | 24 | ++++++++++++++++++++++++ |
M | util.c3 | | | 177 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
3 files changed, 227 insertions(+), 0 deletions(-)
diff --git a/ba1j.c3 b/ba1j.c3
@@ -0,0 +1,26 @@
+// Frequent words with mismatches
+
+module ba1j;
+import std::io;
+import std::io::file;
+import std::collections;
+
+import util;
+
+fn void! main(String[] args) {
+ if (args.len != 2) {
+ io::eprintn("Please supply path to data file");
+ return IoError.FILE_NOT_FOUND?;
+ }
+ File f = file::open(args[1], "rb")!;
+ String genome = io::treadline(&f)!;
+ String ints = io::treadline(&f)!;
+ String[] ints_split = ints.tsplit(" ");
+ int k = ints_split[0].to_integer(int)!;
+ int d = ints_split[1].to_integer(int)!;
+ String[] matches = util::frequent_words_with_mismatches(genome, k, d);
+ foreach (match : matches) {
+ io::printf("%s ", match);
+ }
+}
+
diff --git a/ba1l.c3 b/ba1l.c3
@@ -0,0 +1,24 @@
+// Neighbors
+
+module ba1l;
+import std::io;
+import std::io::file;
+import std::collections;
+
+import util;
+
+fn void! main(String[] args) {
+ if (args.len != 2) {
+ io::eprintn("Please supply path to data file");
+ return IoError.FILE_NOT_FOUND?;
+ }
+ File f = file::open(args[1], "rb")!;
+ String pattern = io::treadline(&f)!;
+ int d = io::treadline(&f)!.to_integer(int)!;
+ String[] matches = util::neighbors(pattern, d);
+ foreach (match : matches) {
+ io::printf("%s ", match);
+ }
+}
+
+
diff --git a/util.c3 b/util.c3
@@ -1,5 +1,8 @@
module util;
import std::collections;
+import std::core::string;
+import std::io;
+import std::sort;
def FrequencyTable = HashMap(<String, int>);
@@ -120,4 +123,178 @@ fn void test_approximate_pattern_matching() @test {
}
}
+// Just like frequent words, except instead of matching just each kmer,
+// we create a neighbohood(d) for each kmer and that neighborhood is compared against the text
+fn String[] frequent_words_with_mismatches(String text, int k, int d, Allocator alloc= allocator::heap()) {
+ @pool() {
+ FrequencyTable freq_map;
+ freq_map.new_init(allocator: alloc);
+ for (int i = 0; i <= text.len - k; i += 1) {
+ String pattern = text[i:k];
+ String[] neighborhood = neighbors(pattern, d, allocator::temp());
+ foreach (neighbor : neighborhood) {
+ freq_map.@get_or_set(neighbor, 0); // don't need to clone, keys copied into table
+ if (try int* count = freq_map.get_ref(neighbor)) {
+ *count += 1;
+ }
+ }
+ }
+
+ int[] counts = freq_map.value_tlist();
+ int max = 0;
+ freq_map.@each(; String _k, int count) {
+ if (count > max) max = count;
+ };
+
+ List(<String>) res;
+ res.temp_init();
+ freq_map.@each(; String key, int v) {
+ if (v == max) {
+ res.push(key.copy(alloc));
+ }
+ };
+ return res.to_new_array(alloc);
+ };
+}
+
+struct FreqWordsMismatchesTest {
+ String text;
+ int k;
+ int d;
+ String[] expected;
+}
+fn void test_frequent_words_with_mismatches() @test {
+ FreqWordsMismatchesTest[] tests = {
+ {
+ .text = "ACGTTGCATGTCGCATGATGCATGAGAGCT",
+ .k = 4,
+ .d = 1,
+ .expected = {"ATGT", "GATG", "ATGC"},
+ },
+ {
+ .text = "AGGT",
+ .k = 2,
+ .d = 1,
+ .expected = {"GG"},
+ },
+ {
+ .text = "AGGGT",
+ .k = 2,
+ .d = 0,
+ .expected = {"GG"},
+ },
+ {
+ .text = "AGGCGG",
+ .k = 3,
+ .d = 0,
+ .expected = {"AGG", "GGC", "GCG", "CGG"},
+ },
+ };
+ foreach (t : tests) {
+ String[] matches = frequent_words_with_mismatches(t.text, t.k, t.d);
+ assert(matches.len == t.expected.len, "Expected %s, found %s", t.expected, matches);
+ bool found = false;
+ foreach (match : matches) {
+ foreach (exp : t.expected) {
+ found = true;
+ }
+ }
+ assert(found, "Expected %s, found %s", t.expected, matches);
+ }
+}
+
+// Build neighbors by building kmers over the neighborhood of each suffix.
+// So basically, start iterating from the last element (as the base suffix)
+// neighborhood, and build up kmers within d using the previous neighborhood +
+// a prefix element.
+fn String[] neighbors(String pattern, int d, Allocator alloc = allocator::heap())
+{
+ if (d == 0) return {pattern};
+ if (pattern.len == 0) return {};
+ List(<String>) neighborhood;
+ neighborhood.temp_init_with_array({"A", "C", "G", "T"});
+ List(<String>) suffix_neighborhood;
+ suffix_neighborhood.temp_init();
+ for (int i = 1; i < pattern.len; i += 1) {
+ // swap suffix_neighborhood and neighborhood. The previous
+ // neighborhood becomes the new suffix_neighborhood to iterate
+ // over. Now we need a place to collect the next neighborhood into,
+ // so we reuse the memory from the old suffix_neighborhood, which is
+ // no longer useful for future calculation, as its results have already
+ // been moved into the old neighborhood/new suffix_neighborhood as
+ // calculated during the last iteration. (otherwise we have to keep
+ // allocating new memory to collect the next neighborhood into).
+ List(<String>) swap_var;
+ swap_var = suffix_neighborhood;
+ suffix_neighborhood = neighborhood;
+ neighborhood = swap_var;
+ neighborhood.clear();
+
+ int suffix_idx = pattern.len - i;
+ String suffix = pattern[suffix_idx..];
+ //io::printfn("%s, suffix_neighborhood = %s, neighborhood = %s", suffix, suffix_neighborhood, neighborhood);
+ foreach (suffix_neighbor : suffix_neighborhood) {
+ //io::printfn("%s, %s, %s, %s", suffix, suffix_neighbor, suffix_neighborhood, neighborhood);
+ if (hamming_distance(suffix, suffix_neighbor) < d) {
+ // adding the additional base will be at most hamming dist == d,
+ // TODO we can reuse the memory of neighborhoods by swapping, can we
+ // do something about all the new strings?
+ neighborhood.push("A".tconcat(suffix_neighbor));
+ neighborhood.push("C".tconcat(suffix_neighbor));
+ neighborhood.push("G".tconcat(suffix_neighbor));
+ neighborhood.push("T".tconcat(suffix_neighbor));
+ } else {
+ // hamming distance == d. It can't be more, because the neighbors are
+ // generated up to hamming distance d. Adding the first symbol from the
+ // original pattern cannot increase the hamming distance.
+ neighborhood.push(pattern[suffix_idx - 1:1].tconcat(suffix_neighbor));
+ }
+ }
+ }
+ String[] res = allocator::new_array(alloc, String, neighborhood.len());
+ foreach (i, neighbor : neighborhood) {
+ res[i] = neighbor.copy(alloc);
+ }
+ return res;
+}
+struct NeighborsTest {
+ String text;
+ int d;
+ String[] expected;
+}
+fn void test_neighbors() @test {
+ NeighborsTest[] tests = {
+ {
+ .text = "ACG",
+ .d = 1,
+ .expected = {"ACG", "ACT", "ACA", "ACC", "ATG", "AGG", "AAG", "TCG", "GCG", "CCG"},
+ },
+ {
+ .text = "AGA",
+ .d = 0,
+ .expected = {"AGA"},
+ },
+ {
+ .text = "AAA",
+ .d = 1,
+ .expected = {"AAA", "AAC", "AAG", "AAT", "ACA", "AGA", "ATA", "CAA", "GAA", "TAA"},
+ },
+ {
+ .text = "A",
+ .d = 1,
+ .expected = {"A", "C", "T", "G"},
+ },
+ };
+ foreach (t : tests) {
+ String[] matches = neighbors(t.text, t.d);
+ assert(matches.len == t.expected.len, "Expected %s, found %s", t.expected, matches);
+ bool found = false;
+ foreach (match : matches) {
+ foreach (exp : t.expected) {
+ found = true;
+ }
+ }
+ assert(found, "Expected %s, found %s", t.expected, matches);
+ }
+}