ba1j: frequent words with mismatches and rc - bioinformatics

commit f541c507cb7bb759ce4d54c73c7a21df5cafb2c3
parent a0484fbc035f101cabb64656aef6ebedfcc361ee
Author: Walther Chen <walther.chen@gmail.com>
Date:   Fri,  1 Nov 2024 11:50:05 -0400

ba1j: frequent words with mismatches and rc

Diffstat:
M ba1c.c3  | 22 +++-------------------
A ba1j.c3  | 27 +++++++++++++++++++++++++++
M util.c3  | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++---

3 files changed, 80 insertions(+), 22 deletions(-)
diff --git a/ba1c.c3 b/ba1c.c3
@@ -6,6 +6,8 @@ import std::io;
 import std::io::file;
 import std::collections;
 
+import util;
+
 fn void! main(String[] args) {
 	if (args.len != 2) {
 		io::eprintn("Please supply path to data file");
@@ -13,24 +15,6 @@ fn void! main(String[] args) {
 	}
 	File f = file::open(args[1], "rb")!;
 	String pattern = io::treadline(&f)!;
-	io::printfn(reverse_complement(pattern));
+	io::printfn(util::reverse_complement(pattern));
 }
 
-fn String reverse_complement(String pattern, Allocator alloc = allocator::heap()) {
-	char[] s = allocator::alloc_array(alloc, char, pattern.len);
-	foreach (i, c : pattern) {
-		ulong idx = s.len - i - 1;
-		switch (c) {
-			case 'A': s[idx] = 'T';
-			case 'C': s[idx] = 'G';
-			case 'T': s[idx] = 'A';
-			case 'G': s[idx] = 'C';
-			default: $$unreachable();
-		}
-	}
-	return (String)s;
-}
-
-fn void test_reverse_complement() @test {
-	assert(reverse_complement("AAAACCCGGT") == "ACCGGGTTTT");
-}
diff --git a/ba1j.c3 b/ba1j.c3
@@ -0,0 +1,27 @@
+// Frequent words with mismatches and rc
+
+module ba1j;
+import std::io;
+import std::io::file;
+import std::collections;
+
+import util;
+
+fn void! main(String[] args) {
+	if (args.len != 2) {
+		io::eprintn("Please supply path to data file");
+		return IoError.FILE_NOT_FOUND?;
+	}
+	File f = file::open(args[1], "rb")!;
+	String genome = io::treadline(&f)!;
+	String ints = io::treadline(&f)!;
+	String[] ints_split = ints.tsplit(" ");
+	int k = ints_split[0].to_integer(int)!;
+	int d = ints_split[1].to_integer(int)!;
+	String[] matches = util::frequent_words_with_mismatches_and_rc(genome, k, d);
+	foreach (match : matches) {
+		io::printf("%s ", match);
+	}
+}
+
+
diff --git a/util.c3 b/util.c3
@@ -7,6 +7,25 @@ import std::sort;
 
 import test;
 
+fn String reverse_complement(String pattern, Allocator alloc = allocator::heap()) {
+	char[] s = allocator::alloc_array(alloc, char, pattern.len);
+	foreach (i, c : pattern) {
+		ulong idx = s.len - i - 1;
+		switch (c) {
+			case 'A': s[idx] = 'T';
+			case 'C': s[idx] = 'G';
+			case 'T': s[idx] = 'A';
+			case 'G': s[idx] = 'C';
+			default: $$unreachable();
+		}
+	}
+	return (String)s;
+}
+
+fn void test_reverse_complement() @test {
+	assert(reverse_complement("AAAACCCGGT") == "ACCGGGTTTT");
+}
+
 def FrequencyTable = HashMap(<String, int>);
 
 // counts of kmers in a text.
@@ -128,8 +147,17 @@ fn void test_approximate_pattern_matching() @test {
 
 // Just like frequent words, except instead of matching just each kmer,
 // we create a neighbohood(d) for each kmer and that neighborhood is compared against the text
-fn String[] frequent_words_with_mismatches(String text, int k, int d, Allocator alloc= allocator::heap()) {
+fn String[] _frequent_words_with_mismatches(
+	String text,
+	int k,
+	int d,
+	bool with_rc,
+	Allocator alloc= allocator::heap())
+{
 	@pool() {
+		if (with_rc) {
+			text = text.concat(reverse_complement(text, allocator::temp()), allocator::temp());
+		}
 		FrequencyTable freq_map;
 		freq_map.new_init(allocator: alloc);
 		for (int i = 0; i <= text.len - k; i += 1) {
@@ -160,6 +188,13 @@ fn String[] frequent_words_with_mismatches(String text, int k, int d, Allocator 
 	};
 }
 
+fn String[] frequent_words_with_mismatches(String text, int k, int d, Allocator alloc= allocator::heap()) {
+	return _frequent_words_with_mismatches(text, k, d, false, alloc);
+}
+fn String[] frequent_words_with_mismatches_and_rc(String text, int k, int d, Allocator alloc= allocator::heap()) {
+	return _frequent_words_with_mismatches(text, k, d, true, alloc);
+}
+
 struct FreqWordsMismatchesTest {
 	String text;
 	int k;
@@ -198,6 +233,20 @@ fn void test_frequent_words_with_mismatches() @test {
 		test::expect_equal_slices_sorted(t.expected, matches);
 	}
 }
+fn void test_frequent_words_with_mismatches_and_rc() @test {
+	FreqWordsMismatchesTest[] tests = {
+		{
+			.text = "ACGTTGCATGTCGCATGATGCATGAGAGCT",
+			.k = 4,
+			.d = 1,
+			.expected = {"ATGT", "ACAT"},
+		},
+	};
+	foreach (t : tests) {
+		String[] matches = frequent_words_with_mismatches_and_rc(t.text, t.k, t.d);
+		test::expect_equal_slices_sorted(t.expected, matches);
+	}
+}
 
 // Build neighbors by building kmers over the neighborhood of each suffix.
 // So basically, start iterating from the last element (as the base suffix)
@@ -304,5 +353,3 @@ fn void test_neighbors() @test {
 		test::expect_equal_slices_sorted(t.expected, matches);
 	}
 }
-
-

M	ba1c.c3	\|	22	+++-------------------
A	ba1j.c3	\|	27	+++++++++++++++++++++++++++
M	util.c3	\|	53	++++++++++++++++++++++++++++++++++++++++++++++++++---