commit 6eb4dd815927cb9bd913a744c2a9574360cd45ec
parent 87e8f988a6179fc6764a97c86f0053f538b4f006
Author: Walther Chen <walther.chen@gmail.com>
Date:   Fri,  1 Nov 2024 09:22:17 -0400

ba1j, ba1l: frequent_words_with_mismatch, neighbors

Diffstat:
Aba1j.c3 | 26++++++++++++++++++++++++++
Aba1l.c3 | 24++++++++++++++++++++++++
Mutil.c3 | 177+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 227 insertions(+), 0 deletions(-)

diff --git a/ba1j.c3 b/ba1j.c3 @@ -0,0 +1,26 @@ +// Frequent words with mismatches + +module ba1j; +import std::io; +import std::io::file; +import std::collections; + +import util; + +fn void! main(String[] args) { + if (args.len != 2) { + io::eprintn("Please supply path to data file"); + return IoError.FILE_NOT_FOUND?; + } + File f = file::open(args[1], "rb")!; + String genome = io::treadline(&f)!; + String ints = io::treadline(&f)!; + String[] ints_split = ints.tsplit(" "); + int k = ints_split[0].to_integer(int)!; + int d = ints_split[1].to_integer(int)!; + String[] matches = util::frequent_words_with_mismatches(genome, k, d); + foreach (match : matches) { + io::printf("%s ", match); + } +} + diff --git a/ba1l.c3 b/ba1l.c3 @@ -0,0 +1,24 @@ +// Neighbors + +module ba1l; +import std::io; +import std::io::file; +import std::collections; + +import util; + +fn void! main(String[] args) { + if (args.len != 2) { + io::eprintn("Please supply path to data file"); + return IoError.FILE_NOT_FOUND?; + } + File f = file::open(args[1], "rb")!; + String pattern = io::treadline(&f)!; + int d = io::treadline(&f)!.to_integer(int)!; + String[] matches = util::neighbors(pattern, d); + foreach (match : matches) { + io::printf("%s ", match); + } +} + + diff --git a/util.c3 b/util.c3 @@ -1,5 +1,8 @@ module util; import std::collections; +import std::core::string; +import std::io; +import std::sort; def FrequencyTable = HashMap(<String, int>); @@ -120,4 +123,178 @@ fn void test_approximate_pattern_matching() @test { } } +// Just like frequent words, except instead of matching just each kmer, +// we create a neighbohood(d) for each kmer and that neighborhood is compared against the text +fn String[] frequent_words_with_mismatches(String text, int k, int d, Allocator alloc= allocator::heap()) { + @pool() { + FrequencyTable freq_map; + freq_map.new_init(allocator: alloc); + for (int i = 0; i <= text.len - k; i += 1) { + String pattern = text[i:k]; + String[] neighborhood = neighbors(pattern, d, allocator::temp()); + foreach (neighbor : neighborhood) { + freq_map.@get_or_set(neighbor, 0); // don't need to clone, keys copied into table + if (try int* count = freq_map.get_ref(neighbor)) { + *count += 1; + } + } + } + + int[] counts = freq_map.value_tlist(); + int max = 0; + freq_map.@each(; String _k, int count) { + if (count > max) max = count; + }; + + List(<String>) res; + res.temp_init(); + freq_map.@each(; String key, int v) { + if (v == max) { + res.push(key.copy(alloc)); + } + }; + return res.to_new_array(alloc); + }; +} + +struct FreqWordsMismatchesTest { + String text; + int k; + int d; + String[] expected; +} +fn void test_frequent_words_with_mismatches() @test { + FreqWordsMismatchesTest[] tests = { + { + .text = "ACGTTGCATGTCGCATGATGCATGAGAGCT", + .k = 4, + .d = 1, + .expected = {"ATGT", "GATG", "ATGC"}, + }, + { + .text = "AGGT", + .k = 2, + .d = 1, + .expected = {"GG"}, + }, + { + .text = "AGGGT", + .k = 2, + .d = 0, + .expected = {"GG"}, + }, + { + .text = "AGGCGG", + .k = 3, + .d = 0, + .expected = {"AGG", "GGC", "GCG", "CGG"}, + }, + }; + foreach (t : tests) { + String[] matches = frequent_words_with_mismatches(t.text, t.k, t.d); + assert(matches.len == t.expected.len, "Expected %s, found %s", t.expected, matches); + bool found = false; + foreach (match : matches) { + foreach (exp : t.expected) { + found = true; + } + } + assert(found, "Expected %s, found %s", t.expected, matches); + } +} + +// Build neighbors by building kmers over the neighborhood of each suffix. +// So basically, start iterating from the last element (as the base suffix) +// neighborhood, and build up kmers within d using the previous neighborhood + +// a prefix element. +fn String[] neighbors(String pattern, int d, Allocator alloc = allocator::heap()) +{ + if (d == 0) return {pattern}; + if (pattern.len == 0) return {}; + List(<String>) neighborhood; + neighborhood.temp_init_with_array({"A", "C", "G", "T"}); + List(<String>) suffix_neighborhood; + suffix_neighborhood.temp_init(); + for (int i = 1; i < pattern.len; i += 1) { + // swap suffix_neighborhood and neighborhood. The previous + // neighborhood becomes the new suffix_neighborhood to iterate + // over. Now we need a place to collect the next neighborhood into, + // so we reuse the memory from the old suffix_neighborhood, which is + // no longer useful for future calculation, as its results have already + // been moved into the old neighborhood/new suffix_neighborhood as + // calculated during the last iteration. (otherwise we have to keep + // allocating new memory to collect the next neighborhood into). + List(<String>) swap_var; + swap_var = suffix_neighborhood; + suffix_neighborhood = neighborhood; + neighborhood = swap_var; + neighborhood.clear(); + + int suffix_idx = pattern.len - i; + String suffix = pattern[suffix_idx..]; + //io::printfn("%s, suffix_neighborhood = %s, neighborhood = %s", suffix, suffix_neighborhood, neighborhood); + foreach (suffix_neighbor : suffix_neighborhood) { + //io::printfn("%s, %s, %s, %s", suffix, suffix_neighbor, suffix_neighborhood, neighborhood); + if (hamming_distance(suffix, suffix_neighbor) < d) { + // adding the additional base will be at most hamming dist == d, + // TODO we can reuse the memory of neighborhoods by swapping, can we + // do something about all the new strings? + neighborhood.push("A".tconcat(suffix_neighbor)); + neighborhood.push("C".tconcat(suffix_neighbor)); + neighborhood.push("G".tconcat(suffix_neighbor)); + neighborhood.push("T".tconcat(suffix_neighbor)); + } else { + // hamming distance == d. It can't be more, because the neighbors are + // generated up to hamming distance d. Adding the first symbol from the + // original pattern cannot increase the hamming distance. + neighborhood.push(pattern[suffix_idx - 1:1].tconcat(suffix_neighbor)); + } + } + } + String[] res = allocator::new_array(alloc, String, neighborhood.len()); + foreach (i, neighbor : neighborhood) { + res[i] = neighbor.copy(alloc); + } + return res; +} +struct NeighborsTest { + String text; + int d; + String[] expected; +} +fn void test_neighbors() @test { + NeighborsTest[] tests = { + { + .text = "ACG", + .d = 1, + .expected = {"ACG", "ACT", "ACA", "ACC", "ATG", "AGG", "AAG", "TCG", "GCG", "CCG"}, + }, + { + .text = "AGA", + .d = 0, + .expected = {"AGA"}, + }, + { + .text = "AAA", + .d = 1, + .expected = {"AAA", "AAC", "AAG", "AAT", "ACA", "AGA", "ATA", "CAA", "GAA", "TAA"}, + }, + { + .text = "A", + .d = 1, + .expected = {"A", "C", "T", "G"}, + }, + }; + foreach (t : tests) { + String[] matches = neighbors(t.text, t.d); + assert(matches.len == t.expected.len, "Expected %s, found %s", t.expected, matches); + bool found = false; + foreach (match : matches) { + foreach (exp : t.expected) { + found = true; + } + } + assert(found, "Expected %s, found %s", t.expected, matches); + } +}