ba1j, ba1l: frequent_words_with_mismatch, neighbors - bioinformatics

commit 6eb4dd815927cb9bd913a744c2a9574360cd45ec
parent 87e8f988a6179fc6764a97c86f0053f538b4f006
Author: Walther Chen <walther.chen@gmail.com>
Date:   Fri,  1 Nov 2024 09:22:17 -0400

ba1j, ba1l: frequent_words_with_mismatch, neighbors

Diffstat:
A ba1j.c3  | 26 ++++++++++++++++++++++++++
A ba1l.c3  | 24 ++++++++++++++++++++++++
M util.c3  | 177 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

3 files changed, 227 insertions(+), 0 deletions(-)
diff --git a/ba1j.c3 b/ba1j.c3
@@ -0,0 +1,26 @@
+// Frequent words with mismatches
+
+module ba1j;
+import std::io;
+import std::io::file;
+import std::collections;
+
+import util;
+
+fn void! main(String[] args) {
+	if (args.len != 2) {
+		io::eprintn("Please supply path to data file");
+		return IoError.FILE_NOT_FOUND?;
+	}
+	File f = file::open(args[1], "rb")!;
+	String genome = io::treadline(&f)!;
+	String ints = io::treadline(&f)!;
+	String[] ints_split = ints.tsplit(" ");
+	int k = ints_split[0].to_integer(int)!;
+	int d = ints_split[1].to_integer(int)!;
+	String[] matches = util::frequent_words_with_mismatches(genome, k, d);
+	foreach (match : matches) {
+		io::printf("%s ", match);
+	}
+}
+
diff --git a/ba1l.c3 b/ba1l.c3
@@ -0,0 +1,24 @@
+// Neighbors
+
+module ba1l;
+import std::io;
+import std::io::file;
+import std::collections;
+
+import util;
+
+fn void! main(String[] args) {
+	if (args.len != 2) {
+		io::eprintn("Please supply path to data file");
+		return IoError.FILE_NOT_FOUND?;
+	}
+	File f = file::open(args[1], "rb")!;
+	String pattern = io::treadline(&f)!;
+	int d = io::treadline(&f)!.to_integer(int)!;
+	String[] matches = util::neighbors(pattern, d);
+	foreach (match : matches) {
+		io::printf("%s ", match);
+	}
+}
+
+
diff --git a/util.c3 b/util.c3
@@ -1,5 +1,8 @@
 module util;
 import std::collections;
+import std::core::string;
+import std::io;
+import std::sort;
 
 def FrequencyTable = HashMap(<String, int>);
 
@@ -120,4 +123,178 @@ fn void test_approximate_pattern_matching() @test {
 	}
 }
 
+// Just like frequent words, except instead of matching just each kmer,
+// we create a neighbohood(d) for each kmer and that neighborhood is compared against the text
+fn String[] frequent_words_with_mismatches(String text, int k, int d, Allocator alloc= allocator::heap()) {
+	@pool() {
+		FrequencyTable freq_map;
+		freq_map.new_init(allocator: alloc);
+		for (int i = 0; i <= text.len - k; i += 1) {
+			String pattern = text[i:k];
+			String[] neighborhood = neighbors(pattern, d, allocator::temp());
+			foreach (neighbor : neighborhood) {
+				freq_map.@get_or_set(neighbor, 0); // don't need to clone, keys copied into table
+				if (try int* count = freq_map.get_ref(neighbor)) {
+					*count += 1;
+				}
+			}
+		}
+
+		int[] counts = freq_map.value_tlist();
+		int max = 0;
+		freq_map.@each(; String _k, int count) {
+			if (count > max) max = count;
+		};
+
+		List(<String>) res;
+		res.temp_init();
+		freq_map.@each(; String key, int v) {
+			if (v == max) {
+				res.push(key.copy(alloc));
+			}
+		};
+		return res.to_new_array(alloc);
+	};
+}
+
+struct FreqWordsMismatchesTest {
+	String text;
+	int k;
+	int d;
+	String[] expected;
+}
+fn void test_frequent_words_with_mismatches() @test {
+	FreqWordsMismatchesTest[] tests = {
+		{
+			.text = "ACGTTGCATGTCGCATGATGCATGAGAGCT",
+			.k = 4,
+			.d = 1,
+			.expected = {"ATGT", "GATG", "ATGC"},
+		},
+		{
+			.text = "AGGT",
+			.k = 2,
+			.d = 1,
+			.expected = {"GG"},
+		},
+		{
+			.text = "AGGGT",
+			.k = 2,
+			.d = 0,
+			.expected = {"GG"},
+		},
+		{
+			.text = "AGGCGG",
+			.k = 3,
+			.d = 0,
+			.expected = {"AGG", "GGC", "GCG", "CGG"},
+		},
+	};
+	foreach (t : tests) {
+		String[] matches = frequent_words_with_mismatches(t.text, t.k, t.d);
+		assert(matches.len == t.expected.len, "Expected %s, found %s", t.expected, matches);
+		bool found = false;
+		foreach (match : matches) {
+			foreach (exp : t.expected) {
+				found = true;
+			}
+		}
+		assert(found, "Expected %s, found %s", t.expected, matches);
+	}
+}
+
+// Build neighbors by building kmers over the neighborhood of each suffix.
+// So basically, start iterating from the last element (as the base suffix)
+// neighborhood, and build up kmers within d using the previous neighborhood +
+// a prefix element.
+fn String[] neighbors(String pattern, int d, Allocator alloc = allocator::heap())
+{
+	if (d == 0) return {pattern};
+	if (pattern.len == 0) return {};
+	List(<String>) neighborhood;
+	neighborhood.temp_init_with_array({"A", "C", "G", "T"});
+	List(<String>) suffix_neighborhood;
+	suffix_neighborhood.temp_init();
+	for (int i = 1; i < pattern.len; i += 1) {
+		// swap suffix_neighborhood and neighborhood. The previous
+		// neighborhood becomes the new suffix_neighborhood to iterate
+		// over. Now we need a place to collect the next neighborhood into,
+		// so we reuse the memory from the old suffix_neighborhood, which is
+		// no longer useful for future calculation, as its results have already
+		// been moved into the old neighborhood/new suffix_neighborhood as
+		// calculated during the last iteration. (otherwise we have to keep
+		// allocating new memory to collect the next neighborhood into).
+		List(<String>) swap_var;
+		swap_var = suffix_neighborhood;
+		suffix_neighborhood = neighborhood;
+		neighborhood = swap_var;
+		neighborhood.clear();
+
+		int suffix_idx = pattern.len - i;
+		String suffix = pattern[suffix_idx..];
+		//io::printfn("%s, suffix_neighborhood = %s, neighborhood = %s", suffix, suffix_neighborhood, neighborhood);
+		foreach (suffix_neighbor : suffix_neighborhood) {
+			//io::printfn("%s, %s, %s, %s", suffix, suffix_neighbor, suffix_neighborhood, neighborhood);
+			if (hamming_distance(suffix, suffix_neighbor) < d) {
+				// adding the additional base will be at most hamming dist == d,
+				// TODO we can reuse the memory of neighborhoods by swapping, can we
+				// do something about all the new strings?
+				neighborhood.push("A".tconcat(suffix_neighbor));
+				neighborhood.push("C".tconcat(suffix_neighbor));
+				neighborhood.push("G".tconcat(suffix_neighbor));
+				neighborhood.push("T".tconcat(suffix_neighbor));
+			} else {
+				// hamming distance == d. It can't be more, because the neighbors are
+				// generated up to hamming distance d. Adding the first symbol from the
+				// original pattern cannot increase the hamming distance.
+				neighborhood.push(pattern[suffix_idx - 1:1].tconcat(suffix_neighbor));
+			}
+		}
+	}
+	String[] res = allocator::new_array(alloc, String, neighborhood.len());
+	foreach (i, neighbor : neighborhood) {
+		res[i] = neighbor.copy(alloc);
+	}
+	return res;
+}
 
+struct NeighborsTest {
+	String text;
+	int d;
+	String[] expected;
+}
+fn void test_neighbors() @test {
+	NeighborsTest[] tests = {
+		{
+			.text = "ACG",
+			.d = 1,
+			.expected = {"ACG", "ACT", "ACA", "ACC", "ATG", "AGG", "AAG", "TCG", "GCG", "CCG"},
+		},
+		{
+			.text = "AGA",
+			.d = 0,
+			.expected = {"AGA"},
+		},
+		{
+			.text = "AAA",
+			.d = 1,
+			.expected = {"AAA", "AAC", "AAG", "AAT", "ACA", "AGA", "ATA", "CAA", "GAA", "TAA"},
+		},
+		{
+			.text = "A",
+			.d = 1,
+			.expected = {"A", "C", "T", "G"},
+		},
+	};
+	foreach (t : tests) {
+		String[] matches = neighbors(t.text, t.d);
+		assert(matches.len == t.expected.len, "Expected %s, found %s", t.expected, matches);
+		bool found = false;
+		foreach (match : matches) {
+			foreach (exp : t.expected) {
+				found = true;
+			}
+		}
+		assert(found, "Expected %s, found %s", t.expected, matches);
+	}
+}

A	ba1j.c3	\|	26	++++++++++++++++++++++++++
A	ba1l.c3	\|	24	++++++++++++++++++++++++
M	util.c3	\|	177	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++