commit 87e8f988a6179fc6764a97c86f0053f538b4f006
parent 8e73fc7bf5edfe731b0813c3a4fd862d214c4124
Author: Walther Chen <walther.chen@gmail.com>
Date: Sat, 26 Oct 2024 23:10:18 -0400
ba1{h,i} approximate pattern {match,count}
Diffstat:
A | ba1h.c3 | | | 24 | ++++++++++++++++++++++++ |
A | ba1i.c3 | | | 22 | ++++++++++++++++++++++ |
M | util.c3 | | | 56 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
3 files changed, 102 insertions(+), 0 deletions(-)
diff --git a/ba1h.c3 b/ba1h.c3
@@ -0,0 +1,24 @@
+// approximate pattern match
+
+module ba1h;
+import std::io;
+import std::io::file;
+import std::collections;
+
+import util;
+
+fn void! main(String[] args) {
+ if (args.len != 2) {
+ io::eprintn("Please supply path to data file");
+ return IoError.FILE_NOT_FOUND?;
+ }
+ File f = file::open(args[1], "rb")!;
+ String pattern = io::treadline(&f)!;
+ String genome = io::treadline(&f)!;
+ String d_str = io::treadline(&f)!;
+ int d = d_str.to_integer(int)!;
+ int[] matches = util::approximate_pattern_matching(pattern, genome, d);
+ foreach (match : matches) {
+ io::printf("%d ", match);
+ }
+}
diff --git a/ba1i.c3 b/ba1i.c3
@@ -0,0 +1,22 @@
+// approximate pattern count
+
+module ba1i;
+import std::io;
+import std::io::file;
+import std::collections;
+
+import util;
+
+fn void! main(String[] args) {
+ if (args.len != 2) {
+ io::eprintn("Please supply path to data file");
+ return IoError.FILE_NOT_FOUND?;
+ }
+ File f = file::open(args[1], "rb")!;
+ String pattern = io::treadline(&f)!;
+ String genome = io::treadline(&f)!;
+ String d_str = io::treadline(&f)!;
+ int d = d_str.to_integer(int)!;
+ int[] matches = util::approximate_pattern_matching(pattern, genome, d);
+ io::printfn("%d", matches.len);
+}
diff --git a/util.c3 b/util.c3
@@ -65,3 +65,59 @@ fn int hamming_distance(String s1, String s2) {
fn void test_hamming_distance() @test {
assert(hamming_distance("GGGCCGTTGGT", "GGACCGTTGAC") == 3);
}
+
+// d is the hamming_distance maximum threshold
+fn int[] approximate_pattern_matching(
+ String pattern,
+ String genome,
+ int d,
+ Allocator alloc = allocator::heap())
+{
+ if (pattern.len == 0 || genome.len == 0) return {};
+ List(<int>) res;
+ @pool() {
+ res.temp_init();
+ for (int i = 0; i <= genome.len - pattern.len; i += 1) {
+ // TODO more efficient to break early when calculating hamming distance
+ if (util::hamming_distance(genome[i:pattern.len], pattern) <= d) {
+ res.push(i);
+ }
+ }
+ return res.to_new_array(alloc);
+ };
+}
+
+struct ApproxMatchTest {
+ String pattern;
+ String genome;
+ int d;
+ int[] expected;
+}
+fn void test_approximate_pattern_matching() @test {
+ ApproxMatchTest[] tests = {
+ {
+ .pattern = "ATA",
+ .genome = "CGATCGAGTACCATAAG",
+ .d = 1,
+ .expected = {2, 7, 12},
+ },
+ {
+ .pattern = "ATTCTGGA",
+ .genome = "CGCCCGAATCCAGAACGCATTCCCATATTTCGGGACCACTGGCCTCCACGGTACGGACGTCAATCAAAT",
+ .d = 3,
+ .expected = {6, 7, 26, 27},
+ },
+ {
+ .pattern = "AAA",
+ .genome = "TTTTTTAAATTTTAAATTTTTT",
+ .d = 2,
+ .expected = {4, 5, 6, 7, 8, 11, 12, 13, 14, 15},
+ },
+ };
+ foreach (t : tests) {
+ int[] matches = approximate_pattern_matching(t.pattern, t.genome, t.d);
+ assert(matches == t.expected, "Expected %s, found %s", t.expected, matches);
+ }
+}
+
+