commit 8fc2834808831cd5ff456908ca25d413c55ed65c
parent 1bacb219fa0c7c302aeb98c6a7a33512225e9c56
Author: Walther Chen <walther.chen@gmail.com>
Date:   Thu, 24 Oct 2024 10:48:05 -0400

refactor clump_finding

Diffstat:
M ba1e.c3 | 39 ++-------------------------------------
A clump_finding.c3 | 29 +++++++++++++++++++++++++++++
M justfile | 2 +-
M util.c3 | 32 ++++++++++++++++++++++++++++++++
4 files changed, 64 insertions(+), 38 deletions(-)

diff --git a/ba1e.c3 b/ba1e.c3
@@ -17,45 +17,10 @@ fn void! main(String[] args) {
     String ints = io::treadline(&f)!;
     String[] ints_split = ints.tsplit(" ");
     int k = ints_split[0].to_integer(int)!;
-    int region_len = ints_split[1].to_integer(int)!; // K
+    int region_len = ints_split[1].to_integer(int)!; // L
     int clump_threshold = ints_split[2].to_integer(int)!; // t
-    String[] clumps = clump_finding(genome, k, region_len, clump_threshold);
+    String[] clumps = util::clump_finding(genome, k, region_len, clump_threshold);
     foreach (clump : clumps) {
         io::printf("%s ", clump);
     }
 }
-
-fn String[] clump_finding(
-    String genome,
-    int k,
-    int region_len, // K
-    int clump_threshold, // t
-    Allocator alloc = allocator::heap())
-{
-    if (k == 0 || genome.len == 0) return {};
-    String[] res;
-    @pool() {
-        HashMap(<String, char>) clumps; // a set
-        clumps.temp_init();
-        for (int i = 0; i <= genome.len - region_len; i += 1) {
-            FrequencyTable freq_map = util::frequency_table(genome[i:region_len], k);
-            freq_map.@each(; String kmer, int count) {
-                if (count >= clump_threshold) {
-                    // TODO can probably just use a growable array on small inputs
-                    // Also, careful: hashmap copies keys! Here that means keys copied
-                    // to temp allocator.
-                    clumps.set(kmer, 0);
-                }
-            };
-        }
-        return clumps.copy_keys(alloc);
-    };
-}
-
-fn void test_clump_finding() @test {
-    String input = "CGGACTCGACAGATGTGAAGAACGACAATGTGAAGACTCGACACGACAGAGTGAAGAGAAGAGGAAACATTGTAA";
-    assert(clump_finding(input, 5, 50, 4) == {"CGACA", "GAAGA"});
-}
-
-
-
diff --git a/clump_finding.c3 b/clump_finding.c3
@@ -0,0 +1,29 @@
+// Clump finding from command-line inputs; prints the number of clumps found.
+// args (read from args[1] through args[4], in this order) are:
+// - path to the file holding the genome (the whole file is read as the genome)
+// - k (word length)
+// - L (region length)
+// - t (threshold number for frequent word to qualify as clump)
+
+module clump_finding;
+import std::io;
+import std::io::file;
+import std::collections;
+
+import util;
+
+fn void! main(String[] args) {
+    if (args.len != 5) { // args[1]..args[4] are read below
+        io::eprintn("Please supply path, k, L, t to data file");
+        return IoError.FILE_NOT_FOUND?; // NOTE(review): reused here for a wrong argument count — confirm intended
+    }
+    File f = file::open(args[1], "rb")!;
+    defer (void)f.close(); // close on scope exit; the close result is discarded
+    String genome = (String)io::read_new_fully(&f)!;
+    int k = args[2].to_integer(int)!; // k (word length)
+    int region_len = args[3].to_integer(int)!; // L (region length)
+    int clump_threshold = args[4].to_integer(int)!; // t (clump threshold)
+    String[] clumps = util::clump_finding(genome, k, region_len, clump_threshold);
+    io::printfn("%s ", clumps.len); // prints the number of clumps, not the clumps themselves
+}
+
diff --git a/justfile b/justfile
@@ -10,4 +10,4 @@
 
 # using compile-run prints a bunch of logs
 run problem *args="":
-    just build {{problem}} -O3 && ./{{problem}} {{args}} && rm ./{{problem}}
+    just build {{problem}} -O3 && time ./{{problem}} {{args}} && rm ./{{problem}}
diff --git a/util.c3 b/util.c3
@@ -16,3 +16,35 @@ fn FrequencyTable frequency_table(String text, int k, Allocator alloc = allocato
     }
     return kmer_counts;
 }
+
+fn String[] clump_finding(
+    String genome, // text whose windows are scanned
+    int k, // word (k-mer) length handed to frequency_table
+    int region_len, // L: length of each window taken from genome
+    int clump_threshold, // t: minimum count within one window for a k-mer to be kept
+    Allocator alloc = allocator::heap()) // passed to copy_keys for the returned keys
+{
+    if (k == 0 || genome.len == 0) return {};
+    String[] res; // NOTE(review): appears unused in this function
+    @pool() {
+        HashMap(<String, char>) clumps; // used as a set: every value stored is 0, only the keys are returned
+        clumps.temp_init();
+        for (int i = 0; i <= genome.len - region_len; i += 1) { // NOTE(review): if genome.len is unsigned, this bound wraps when region_len > genome.len — confirm
+            FrequencyTable freq_map = frequency_table(genome[i:region_len], k); // counts for the region_len-long window starting at i
+            freq_map.@each(; String kmer, int count) {
+                if (count >= clump_threshold) {
+                    // TODO: a growable array would probably be enough for small inputs.
+                    // Careful: the hashmap copies its keys! Here that means the keys are
+                    // copied to the temp allocator.
+                    clumps.set(kmer, 0);
+                }
+            };
+        }
+        return clumps.copy_keys(alloc); // key copies are made with the caller-supplied allocator
+    };
+}
+
+fn void test_clump_finding() @test {
+    String input = "CGGACTCGACAGATGTGAAGAACGACAATGTGAAGACTCGACACGACAGAGTGAAGAGAAGAGGAAACATTGTAA";
+    assert(clump_finding(input, 5, 50, 4) == {"CGACA", "GAAGA"}); // NOTE(review): relies on the order in which copy_keys returns keys — confirm it is stable
+}