commit 87e8f988a6179fc6764a97c86f0053f538b4f006
parent 8e73fc7bf5edfe731b0813c3a4fd862d214c4124
Author: Walther Chen <walther.chen@gmail.com>
Date:   Sat, 26 Oct 2024 23:10:18 -0400

ba1{h,i} approximate pattern {match,count}

Diffstat:
Aba1h.c3 | 24++++++++++++++++++++++++
Aba1i.c3 | 22++++++++++++++++++++++
Mutil.c3 | 56++++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 102 insertions(+), 0 deletions(-)

diff --git a/ba1h.c3 b/ba1h.c3 @@ -0,0 +1,24 @@ +// approximate pattern match + +module ba1h; +import std::io; +import std::io::file; +import std::collections; + +import util; + +fn void! main(String[] args) { + if (args.len != 2) { + io::eprintn("Please supply path to data file"); + return IoError.FILE_NOT_FOUND?; + } + File f = file::open(args[1], "rb")!; + String pattern = io::treadline(&f)!; + String genome = io::treadline(&f)!; + String d_str = io::treadline(&f)!; + int d = d_str.to_integer(int)!; + int[] matches = util::approximate_pattern_matching(pattern, genome, d); + foreach (match : matches) { + io::printf("%d ", match); + } +} diff --git a/ba1i.c3 b/ba1i.c3 @@ -0,0 +1,22 @@ +// approximate pattern count + +module ba1i; +import std::io; +import std::io::file; +import std::collections; + +import util; + +fn void! main(String[] args) { + if (args.len != 2) { + io::eprintn("Please supply path to data file"); + return IoError.FILE_NOT_FOUND?; + } + File f = file::open(args[1], "rb")!; + String pattern = io::treadline(&f)!; + String genome = io::treadline(&f)!; + String d_str = io::treadline(&f)!; + int d = d_str.to_integer(int)!; + int[] matches = util::approximate_pattern_matching(pattern, genome, d); + io::printfn("%d", matches.len); +} diff --git a/util.c3 b/util.c3 @@ -65,3 +65,59 @@ fn int hamming_distance(String s1, String s2) { fn void test_hamming_distance() @test { assert(hamming_distance("GGGCCGTTGGT", "GGACCGTTGAC") == 3); } + +// d is the hamming_distance maximum threshold +fn int[] approximate_pattern_matching( + String pattern, + String genome, + int d, + Allocator alloc = allocator::heap()) +{ + if (pattern.len == 0 || genome.len == 0) return {}; + List(<int>) res; + @pool() { + res.temp_init(); + for (int i = 0; i <= genome.len - pattern.len; i += 1) { + // TODO more efficient to break early when calculating hamming distance + if (util::hamming_distance(genome[i:pattern.len], pattern) <= d) { + res.push(i); + } + } + return res.to_new_array(alloc); + }; +} + +struct ApproxMatchTest { + String pattern; + String genome; + int d; + int[] expected; +} +fn void test_approximate_pattern_matching() @test { + ApproxMatchTest[] tests = { + { + .pattern = "ATA", + .genome = "CGATCGAGTACCATAAG", + .d = 1, + .expected = {2, 7, 12}, + }, + { + .pattern = "ATTCTGGA", + .genome = "CGCCCGAATCCAGAACGCATTCCCATATTTCGGGACCACTGGCCTCCACGGTACGGACGTCAATCAAAT", + .d = 3, + .expected = {6, 7, 26, 27}, + }, + { + .pattern = "AAA", + .genome = "TTTTTTAAATTTTAAATTTTTT", + .d = 2, + .expected = {4, 5, 6, 7, 8, 11, 12, 13, 14, 15}, + }, + }; + foreach (t : tests) { + int[] matches = approximate_pattern_matching(t.pattern, t.genome, t.d); + assert(matches == t.expected, "Expected %s, found %s", t.expected, matches); + } +} + +