commit 8fc2834808831cd5ff456908ca25d413c55ed65c
parent 1bacb219fa0c7c302aeb98c6a7a33512225e9c56
Author: Walther Chen <walther.chen@gmail.com>
Date: Thu, 24 Oct 2024 10:48:05 -0400
refactor clump_finding
Diffstat:
4 files changed, 64 insertions(+), 38 deletions(-)
diff --git a/ba1e.c3 b/ba1e.c3
@@ -17,45 +17,10 @@ fn void! main(String[] args) {
String ints = io::treadline(&f)!;
String[] ints_split = ints.tsplit(" ");
int k = ints_split[0].to_integer(int)!;
- int region_len = ints_split[1].to_integer(int)!; // K
+ int region_len = ints_split[1].to_integer(int)!; // L
int clump_threshold = ints_split[2].to_integer(int)!; // t
- String[] clumps = clump_finding(genome, k, region_len, clump_threshold);
+ String[] clumps = util::clump_finding(genome, k, region_len, clump_threshold);
foreach (clump : clumps) {
io::printf("%s ", clump);
}
}
-
-fn String[] clump_finding(
- String genome,
- int k,
- int region_len, // K
- int clump_threshold, // t
- Allocator alloc = allocator::heap())
-{
- if (k == 0 || genome.len == 0) return {};
- String[] res;
- @pool() {
- HashMap(<String, char>) clumps; // a set
- clumps.temp_init();
- for (int i = 0; i <= genome.len - region_len; i += 1) {
- FrequencyTable freq_map = util::frequency_table(genome[i:region_len], k);
- freq_map.@each(; String kmer, int count) {
- if (count >= clump_threshold) {
- // TODO can probably just use a growable array on small inputs
- // Also, careful: hashmap copies keys! Here that means keys copied
- // to temp allocator.
- clumps.set(kmer, 0);
- }
- };
- }
- return clumps.copy_keys(alloc);
- };
-}
-
-fn void test_clump_finding() @test {
- String input = "CGGACTCGACAGATGTGAAGAACGACAATGTGAAGACTCGACACGACAGAGTGAAGAGAAGAGGAAACATTGTAA";
- assert(clump_finding(input, 5, 50, 4) == {"CGACA", "GAAGA"});
-}
-
-
-
diff --git a/clump_finding.c3 b/clump_finding.c3
@@ -0,0 +1,29 @@
+// clump finding from inputs
+// args are:
+// - path to a file containing the genome
+// - k (word length)
+// - L (region length)
+// - t (threshold number for frequent word to qualify as clump)
+
+module clump_finding;
+import std::io;
+import std::io::file;
+import std::collections;
+
+import util;
+
+// Entry point: reads the genome from the file named by args[1] and prints the number of clump k-mers found.
+fn void! main(String[] args) {
+    if (args.len != 5) { // program name plus path, k, L, t
+        io::eprintn("Please supply path to data file, k, L, t");
+        return IoError.FILE_NOT_FOUND?; // same fault as before; doubles as the bad-usage error
+    }
+    File f = file::open(args[1], "rb")!;
+    defer (void)f.close();
+    String genome = ((String)io::read_new_fully(&f)!).trim(); // trim so a trailing newline is not read as sequence
+    int k = args[2].to_integer(int)!; // word length
+    int region_len = args[3].to_integer(int)!; // L
+    int clump_threshold = args[4].to_integer(int)!; // t
+    io::printfn("%s ", util::clump_finding(genome, k, region_len, clump_threshold).len); // count only
+}
+
diff --git a/justfile b/justfile
@@ -10,4 +10,4 @@
# using compile-run prints a bunch of logs
run problem *args="":
- just build {{problem}} -O3 && ./{{problem}} {{args}} && rm ./{{problem}}
+ just build {{problem}} -O3 && time ./{{problem}} {{args}} && rm ./{{problem}}
diff --git a/util.c3 b/util.c3
@@ -16,3 +16,35 @@ fn FrequencyTable frequency_table(String text, int k, Allocator alloc = allocato
}
return kmer_counts;
}
+
+// Returns the distinct k-mers that occur at least clump_threshold times within some region_len-long window of genome.
+fn String[] clump_finding(
+    String genome,
+    int k, // word length
+    int region_len, // L
+    int clump_threshold, // t
+    Allocator alloc = allocator::heap())
+{
+    // No window can hold a k-mer; this also stops the unsigned genome.len - region_len below from wrapping around.
+    if (k <= 0 || region_len < k || genome.len < (usz)region_len) return {};
+    @pool() {
+        HashMap(<String, char>) clumps; // a set
+        clumps.temp_init();
+        for (int i = 0; i <= genome.len - region_len; i += 1) {
+            FrequencyTable freq_map = frequency_table(genome[i:region_len], k); // NOTE(review): default allocator; confirm the table is freed
+            freq_map.@each(; String kmer, int count) {
+                if (count >= clump_threshold) {
+                    // TODO can probably just use a growable array on small inputs
+                    // Also, careful: hashmap copies keys! Here that means keys copied to temp allocator.
+                    clumps.set(kmer, 0);
+                }
+            };
+        }
+        return clumps.copy_keys(alloc);
+    };
+}
+
+fn void test_clump_finding() @test {
+    // Sample dataset: k = 5, L = 50, t = 4.
+    String[] found = clump_finding("CGGACTCGACAGATGTGAAGAACGACAATGTGAAGACTCGACACGACAGAGTGAAGAGAAGAGGAAACATTGTAA", 5, 50, 4);
+    assert(found == {"CGACA", "GAAGA"});
+}