commit f541c507cb7bb759ce4d54c73c7a21df5cafb2c3
parent a0484fbc035f101cabb64656aef6ebedfcc361ee
Author: Walther Chen <walther.chen@gmail.com>
Date:   Fri,  1 Nov 2024 11:50:05 -0400

ba1j: frequent words with mismatches and rc

Diffstat:
Mba1c.c3 | 22+++-------------------
Aba1j.c3 | 27+++++++++++++++++++++++++++
Mutil.c3 | 53++++++++++++++++++++++++++++++++++++++++++++++++++---
3 files changed, 80 insertions(+), 22 deletions(-)

diff --git a/ba1c.c3 b/ba1c.c3 @@ -6,6 +6,8 @@ import std::io; import std::io::file; import std::collections; +import util; + fn void! main(String[] args) { if (args.len != 2) { io::eprintn("Please supply path to data file"); @@ -13,24 +15,6 @@ fn void! main(String[] args) { } File f = file::open(args[1], "rb")!; String pattern = io::treadline(&f)!; - io::printfn(reverse_complement(pattern)); + io::printfn(util::reverse_complement(pattern)); } -fn String reverse_complement(String pattern, Allocator alloc = allocator::heap()) { - char[] s = allocator::alloc_array(alloc, char, pattern.len); - foreach (i, c : pattern) { - ulong idx = s.len - i - 1; - switch (c) { - case 'A': s[idx] = 'T'; - case 'C': s[idx] = 'G'; - case 'T': s[idx] = 'A'; - case 'G': s[idx] = 'C'; - default: $$unreachable(); - } - } - return (String)s; -} - -fn void test_reverse_complement() @test { - assert(reverse_complement("AAAACCCGGT") == "ACCGGGTTTT"); -} diff --git a/ba1j.c3 b/ba1j.c3 @@ -0,0 +1,27 @@ +// Frequent words with mismatches and rc + +module ba1j; +import std::io; +import std::io::file; +import std::collections; + +import util; + +fn void! main(String[] args) { + if (args.len != 2) { + io::eprintn("Please supply path to data file"); + return IoError.FILE_NOT_FOUND?; + } + File f = file::open(args[1], "rb")!; + String genome = io::treadline(&f)!; + String ints = io::treadline(&f)!; + String[] ints_split = ints.tsplit(" "); + int k = ints_split[0].to_integer(int)!; + int d = ints_split[1].to_integer(int)!; + String[] matches = util::frequent_words_with_mismatches_and_rc(genome, k, d); + foreach (match : matches) { + io::printf("%s ", match); + } +} + + diff --git a/util.c3 b/util.c3 @@ -7,6 +7,25 @@ import std::sort; import test; +fn String reverse_complement(String pattern, Allocator alloc = allocator::heap()) { + char[] s = allocator::alloc_array(alloc, char, pattern.len); + foreach (i, c : pattern) { + ulong idx = s.len - i - 1; + switch (c) { + case 'A': s[idx] = 'T'; + case 'C': s[idx] = 'G'; + case 'T': s[idx] = 'A'; + case 'G': s[idx] = 'C'; + default: $$unreachable(); + } + } + return (String)s; +} + +fn void test_reverse_complement() @test { + assert(reverse_complement("AAAACCCGGT") == "ACCGGGTTTT"); +} + def FrequencyTable = HashMap(<String, int>); // counts of kmers in a text. @@ -128,8 +147,17 @@ fn void test_approximate_pattern_matching() @test { // Just like frequent words, except instead of matching just each kmer, // we create a neighbohood(d) for each kmer and that neighborhood is compared against the text -fn String[] frequent_words_with_mismatches(String text, int k, int d, Allocator alloc= allocator::heap()) { +fn String[] _frequent_words_with_mismatches( + String text, + int k, + int d, + bool with_rc, + Allocator alloc= allocator::heap()) +{ @pool() { + if (with_rc) { + text = text.concat(reverse_complement(text, allocator::temp()), allocator::temp()); + } FrequencyTable freq_map; freq_map.new_init(allocator: alloc); for (int i = 0; i <= text.len - k; i += 1) { @@ -160,6 +188,13 @@ fn String[] frequent_words_with_mismatches(String text, int k, int d, Allocator }; } +fn String[] frequent_words_with_mismatches(String text, int k, int d, Allocator alloc= allocator::heap()) { + return _frequent_words_with_mismatches(text, k, d, false, alloc); +} +fn String[] frequent_words_with_mismatches_and_rc(String text, int k, int d, Allocator alloc= allocator::heap()) { + return _frequent_words_with_mismatches(text, k, d, true, alloc); +} + struct FreqWordsMismatchesTest { String text; int k; @@ -198,6 +233,20 @@ fn void test_frequent_words_with_mismatches() @test { test::expect_equal_slices_sorted(t.expected, matches); } } +fn void test_frequent_words_with_mismatches_and_rc() @test { + FreqWordsMismatchesTest[] tests = { + { + .text = "ACGTTGCATGTCGCATGATGCATGAGAGCT", + .k = 4, + .d = 1, + .expected = {"ATGT", "ACAT"}, + }, + }; + foreach (t : tests) { + String[] matches = frequent_words_with_mismatches_and_rc(t.text, t.k, t.d); + test::expect_equal_slices_sorted(t.expected, matches); + } +} // Build neighbors by building kmers over the neighborhood of each suffix. // So basically, start iterating from the last element (as the base suffix) @@ -304,5 +353,3 @@ fn void test_neighbors() @test { test::expect_equal_slices_sorted(t.expected, matches); } } - -