commit f541c507cb7bb759ce4d54c73c7a21df5cafb2c3
parent a0484fbc035f101cabb64656aef6ebedfcc361ee
Author: Walther Chen <walther.chen@gmail.com>
Date: Fri, 1 Nov 2024 11:50:05 -0400
ba1j: frequent words with mismatches and rc
Diffstat:
M | ba1c.c3 | | | 22 | +++------------------- |
A | ba1j.c3 | | | 27 | +++++++++++++++++++++++++++ |
M | util.c3 | | | 53 | ++++++++++++++++++++++++++++++++++++++++++++++++++--- |
3 files changed, 80 insertions(+), 22 deletions(-)
diff --git a/ba1c.c3 b/ba1c.c3
@@ -6,6 +6,8 @@ import std::io;
import std::io::file;
import std::collections;
+import util;
+
fn void! main(String[] args) {
if (args.len != 2) {
io::eprintn("Please supply path to data file");
@@ -13,24 +15,6 @@ fn void! main(String[] args) {
}
File f = file::open(args[1], "rb")!;
String pattern = io::treadline(&f)!;
- io::printfn(reverse_complement(pattern));
+ io::printfn(util::reverse_complement(pattern));
}
-fn String reverse_complement(String pattern, Allocator alloc = allocator::heap()) {
- char[] s = allocator::alloc_array(alloc, char, pattern.len);
- foreach (i, c : pattern) {
- ulong idx = s.len - i - 1;
- switch (c) {
- case 'A': s[idx] = 'T';
- case 'C': s[idx] = 'G';
- case 'T': s[idx] = 'A';
- case 'G': s[idx] = 'C';
- default: $$unreachable();
- }
- }
- return (String)s;
-}
-
-fn void test_reverse_complement() @test {
- assert(reverse_complement("AAAACCCGGT") == "ACCGGGTTTT");
-}
diff --git a/ba1j.c3 b/ba1j.c3
@@ -0,0 +1,27 @@
+// Frequent words with mismatches and rc
+
+module ba1j;
+import std::io;
+import std::io::file;
+import std::collections;
+
+import util;
+
+fn void! main(String[] args) {
+ if (args.len != 2) {
+ io::eprintn("Please supply path to data file");
+ return IoError.FILE_NOT_FOUND?;
+ }
+ File f = file::open(args[1], "rb")!;
+ String genome = io::treadline(&f)!;
+ String ints = io::treadline(&f)!;
+ String[] ints_split = ints.tsplit(" ");
+ int k = ints_split[0].to_integer(int)!;
+ int d = ints_split[1].to_integer(int)!;
+ String[] matches = util::frequent_words_with_mismatches_and_rc(genome, k, d);
+ foreach (match : matches) {
+ io::printf("%s ", match);
+ }
+}
+
+
diff --git a/util.c3 b/util.c3
@@ -7,6 +7,25 @@ import std::sort;
import test;
+fn String reverse_complement(String pattern, Allocator alloc = allocator::heap()) {
+ char[] s = allocator::alloc_array(alloc, char, pattern.len);
+ foreach (i, c : pattern) {
+ ulong idx = s.len - i - 1;
+ switch (c) {
+ case 'A': s[idx] = 'T';
+ case 'C': s[idx] = 'G';
+ case 'T': s[idx] = 'A';
+ case 'G': s[idx] = 'C';
+ default: $$unreachable();
+ }
+ }
+ return (String)s;
+}
+
+fn void test_reverse_complement() @test {
+ assert(reverse_complement("AAAACCCGGT") == "ACCGGGTTTT");
+}
+
def FrequencyTable = HashMap(<String, int>);
// counts of kmers in a text.
@@ -128,8 +147,17 @@ fn void test_approximate_pattern_matching() @test {
// Just like frequent words, except instead of matching just each kmer,
// we create a neighbohood(d) for each kmer and that neighborhood is compared against the text
-fn String[] frequent_words_with_mismatches(String text, int k, int d, Allocator alloc= allocator::heap()) {
+fn String[] _frequent_words_with_mismatches(
+ String text,
+ int k,
+ int d,
+ bool with_rc,
+ Allocator alloc= allocator::heap())
+{
@pool() {
+ if (with_rc) {
+ text = text.concat(reverse_complement(text, allocator::temp()), allocator::temp());
+ }
FrequencyTable freq_map;
freq_map.new_init(allocator: alloc);
for (int i = 0; i <= text.len - k; i += 1) {
@@ -160,6 +188,13 @@ fn String[] frequent_words_with_mismatches(String text, int k, int d, Allocator
};
}
+fn String[] frequent_words_with_mismatches(String text, int k, int d, Allocator alloc= allocator::heap()) {
+ return _frequent_words_with_mismatches(text, k, d, false, alloc);
+}
+fn String[] frequent_words_with_mismatches_and_rc(String text, int k, int d, Allocator alloc= allocator::heap()) {
+ return _frequent_words_with_mismatches(text, k, d, true, alloc);
+}
+
struct FreqWordsMismatchesTest {
String text;
int k;
@@ -198,6 +233,20 @@ fn void test_frequent_words_with_mismatches() @test {
test::expect_equal_slices_sorted(t.expected, matches);
}
}
+fn void test_frequent_words_with_mismatches_and_rc() @test {
+ FreqWordsMismatchesTest[] tests = {
+ {
+ .text = "ACGTTGCATGTCGCATGATGCATGAGAGCT",
+ .k = 4,
+ .d = 1,
+ .expected = {"ATGT", "ACAT"},
+ },
+ };
+ foreach (t : tests) {
+ String[] matches = frequent_words_with_mismatches_and_rc(t.text, t.k, t.d);
+ test::expect_equal_slices_sorted(t.expected, matches);
+ }
+}
// Build neighbors by building kmers over the neighborhood of each suffix.
// So basically, start iterating from the last element (as the base suffix)
@@ -304,5 +353,3 @@ fn void test_neighbors() @test {
test::expect_equal_slices_sorted(t.expected, matches);
}
}
-
-