util-frequency.c3 (10467B) - raw
1 module util; 2 import std::collections; 3 import std::core::mem; 4 import std::core::string; 5 import std::io; 6 import std::sort; 7 8 import test; 9 10 fn String reverse_complement(String pattern, Allocator alloc = allocator::heap()) { 11 char[] s = allocator::alloc_array(alloc, char, pattern.len); 12 foreach (i, c : pattern) { 13 ulong idx = s.len - i - 1; 14 switch (c) { 15 case 'A': s[idx] = 'T'; 16 case 'C': s[idx] = 'G'; 17 case 'T': s[idx] = 'A'; 18 case 'G': s[idx] = 'C'; 19 default: $$unreachable(); 20 } 21 } 22 return (String)s; 23 } 24 25 fn void test_reverse_complement() @test { 26 assert(reverse_complement("AAAACCCGGT", tmem) == "ACCGGGTTTT"); 27 } 28 29 alias FrequencyTable = HashMap{String, int}; 30 31 // counts of kmers in a text. 32 // Lifetime of result is same as that of `text` param. 33 fn FrequencyTable frequency_table(String text, int k, Allocator alloc = allocator::heap()) { 34 FrequencyTable kmer_counts; 35 kmer_counts.init(allocator: alloc); 36 for (int i = 0; i <= text.len - k; i += 1) { 37 // TODO get_or_update? Annoying to get twice 38 // Modified std so that hashmap doesn't copy keys, but these 39 // keys lifetime is the same as `text`. 40 String key = text[i:k]; 41 kmer_counts.@get_or_set(key, 0); 42 if (try count = kmer_counts.get_ref(key)) { 43 *count += 1; 44 } 45 } 46 return kmer_counts; 47 } 48 49 // Lifetime of result is that of the `genome` param. 50 fn String[] clump_finding( 51 String genome, 52 int k, 53 int region_len, // L 54 int clump_threshold, // t 55 Allocator alloc = allocator::heap()) 56 { 57 if (k == 0 || genome.len == 0) return {}; 58 HashMap{String, char} clumps; // a set 59 @pool() { 60 clumps.tinit(); 61 for (int i = 0; i <= genome.len - region_len; i += 1) { 62 @pool() { 63 FrequencyTable freq_map = frequency_table(genome[i:region_len], k, tmem); 64 freq_map.@each(; String kmer, int count) { 65 if (count >= clump_threshold) { 66 // TODO can probably just use a growable array on small inputs 67 // std hashmap modified to not copy keys, lifetime of kmer is that of 68 // the `genome` param 69 clumps.set(kmer, 0); 70 } 71 }; 72 }; 73 } 74 return clumps.keys(alloc); 75 }; 76 } 77 78 fn void test_clump_finding() @test { 79 String input = "CGGACTCGACAGATGTGAAGAACGACAATGTGAAGACTCGACACGACAGAGTGAAGAGAAGAGGAAACATTGTAA"; 80 assert(clump_finding(input, 5, 50, 4, tmem) == {"CGACA", "GAAGA"}); 81 } 82 83 fn int hamming_distance(String s1, String s2) { 84 assert(s1.len == s2.len); 85 int count = 0; 86 for (int i = 0; i < s1.len; i += 1) { 87 if (s1[i] != s2[i]) { 88 count += 1; 89 } 90 } 91 return count; 92 } 93 94 fn void test_hamming_distance() @test { 95 assert(hamming_distance("GGGCCGTTGGT", "GGACCGTTGAC") == 3); 96 } 97 98 // d is the hamming_distance maximum threshold 99 fn int[] approximate_pattern_matching( 100 String pattern, 101 String genome, 102 int d, 103 Allocator alloc = allocator::heap()) 104 { 105 if (pattern.len == 0 || genome.len == 0) return {}; 106 List{int} res; 107 @pool() { 108 res.tinit(); 109 for (int i = 0; i <= genome.len - pattern.len; i += 1) { 110 // TODO more efficient to break early when calculating hamming distance 111 if (util::hamming_distance(genome[i:pattern.len], pattern) <= d) { 112 res.push(i); 113 } 114 } 115 return res.to_array(alloc); 116 }; 117 } 118 119 struct ApproxMatchTest { 120 String pattern; 121 String genome; 122 int d; 123 int[] expected; 124 } 125 fn void test_approximate_pattern_matching() @test { 126 ApproxMatchTest[] tests = { 127 { 128 .pattern = "ATA", 129 .genome = "CGATCGAGTACCATAAG", 130 .d = 1, 131 .expected = {2, 7, 12}, 132 }, 133 { 134 .pattern = "ATTCTGGA", 135 .genome = "CGCCCGAATCCAGAACGCATTCCCATATTTCGGGACCACTGGCCTCCACGGTACGGACGTCAATCAAAT", 136 .d = 3, 137 .expected = {6, 7, 26, 27}, 138 }, 139 { 140 .pattern = "AAA", 141 .genome = "TTTTTTAAATTTTAAATTTTTT", 142 .d = 2, 143 .expected = {4, 5, 6, 7, 8, 11, 12, 13, 14, 15}, 144 }, 145 }; 146 foreach (t : tests) { 147 int[] matches = approximate_pattern_matching(t.pattern, t.genome, t.d, tmem); 148 assert(matches == t.expected, "Expected %s, found %s", t.expected, matches); 149 } 150 } 151 152 // Just like frequent words, except instead of matching just each kmer, 153 // we create a neighbohood(d) for each kmer and that neighborhood is compared against the text 154 fn String[] _frequent_words_with_mismatches( 155 String text, 156 int k, 157 int d, 158 bool with_rc, 159 Allocator alloc= allocator::heap()) 160 { 161 @pool() { 162 if (with_rc) { 163 text = text.tconcat(reverse_complement(text, allocator::temp())); 164 } 165 FrequencyTable freq_map; 166 freq_map.init(allocator: alloc); 167 for (int i = 0; i <= text.len - k; i += 1) { 168 String pattern = text[i:k]; 169 String[] neighborhood = neighbors(pattern, d, allocator::temp()); 170 foreach (neighbor : neighborhood) { 171 freq_map.@get_or_set(neighbor, 0); 172 if (try int* count = freq_map.get_ref(neighbor)) { 173 *count += 1; 174 } 175 } 176 } 177 178 int[] counts = freq_map.tvalues(); 179 int max = 0; 180 freq_map.@each(; String _k, int count) { 181 if (count > max) max = count; 182 }; 183 184 List{String} res; 185 res.tinit(); 186 freq_map.@each(; String key, int v) { 187 if (v == max) { 188 res.push(key.copy(alloc)); 189 } 190 }; 191 return res.to_array(alloc); 192 }; 193 } 194 195 fn String[] frequent_words_with_mismatches(String text, int k, int d, Allocator alloc= allocator::heap()) { 196 return _frequent_words_with_mismatches(text, k, d, false, alloc); 197 } 198 fn String[] frequent_words_with_mismatches_and_rc(String text, int k, int d, Allocator alloc= allocator::heap()) { 199 return _frequent_words_with_mismatches(text, k, d, true, alloc); 200 } 201 202 struct FreqWordsMismatchesTest { 203 String text; 204 int k; 205 int d; 206 String[] expected; 207 } 208 fn void test_frequent_words_with_mismatches() @test { 209 FreqWordsMismatchesTest[] tests = { 210 { 211 .text = "ACGTTGCATGTCGCATGATGCATGAGAGCT", 212 .k = 4, 213 .d = 1, 214 .expected = {"ATGT", "GATG", "ATGC"}, 215 }, 216 { 217 .text = "AGGT", 218 .k = 2, 219 .d = 1, 220 .expected = {"GG"}, 221 }, 222 { 223 .text = "AGGGT", 224 .k = 2, 225 .d = 0, 226 .expected = {"GG"}, 227 }, 228 { 229 .text = "AGGCGG", 230 .k = 3, 231 .d = 0, 232 .expected = {"AGG", "GGC", "GCG", "CGG"}, 233 }, 234 }; 235 foreach (t : tests) { 236 @pool() { 237 String[] matches = frequent_words_with_mismatches(t.text, t.k, t.d, tmem); 238 test::expect_equal_slices_sorted(t.expected, matches); 239 }; 240 } 241 } 242 fn void test_frequent_words_with_mismatches_and_rc() @test { 243 FreqWordsMismatchesTest[] tests = { 244 { 245 .text = "ACGTTGCATGTCGCATGATGCATGAGAGCT", 246 .k = 4, 247 .d = 1, 248 .expected = {"ATGT", "ACAT"}, 249 }, 250 }; 251 foreach (t : tests) { 252 String[] matches = frequent_words_with_mismatches_and_rc(t.text, t.k, t.d, tmem); 253 test::expect_equal_slices_sorted(t.expected, matches); 254 } 255 } 256 257 // Build neighbors by building kmers over the neighborhood of each suffix. 258 // So basically, start iterating from the last element (as the base suffix) 259 // neighborhood, and build up kmers within d using the previous neighborhood + 260 // a prefix element. 261 fn String[] neighbors(String pattern, int d, Allocator alloc = allocator::heap()) 262 { 263 if (d == 0) return {pattern}; 264 if (pattern.len == 0) return {}; 265 @pool() { 266 DynamicArenaAllocator neighborhood_arena; 267 DynamicArenaAllocator suffix_neighborhood_arena; 268 neighborhood_arena.init(tmem, 1024); 269 suffix_neighborhood_arena.init(tmem, 1024); 270 List{String} neighborhood; 271 List{String} suffix_neighborhood; 272 neighborhood.init_with_array(&neighborhood_arena, {"A", "C", "G", "T"}); 273 suffix_neighborhood.init(&suffix_neighborhood_arena, 16); 274 for (int i = 1; i < pattern.len; i += 1) { 275 // swap suffix_neighborhood and neighborhood. The previous 276 // neighborhood becomes the new suffix_neighborhood to iterate 277 // over. Now we need a place to collect the next neighborhood into, 278 // so we reuse the memory from the old suffix_neighborhood, which is 279 // no longer useful for future calculation, as its results have already 280 // been moved into the old neighborhood/new suffix_neighborhood as 281 // calculated during the last iteration. (otherwise we have to keep 282 // allocating new memory to collect the next neighborhood into). 283 // 284 // The memory itself is cleared for reuse by the arena holding the 285 // List (see next swap) 286 List{String} swap_var; 287 swap_var = suffix_neighborhood; 288 suffix_neighborhood = neighborhood; 289 neighborhood = swap_var; 290 291 // Same swap with the arenas 292 // Old strings and the List holding them all get cleared 293 DynamicArenaAllocator swap_arena; 294 swap_arena = suffix_neighborhood_arena; 295 suffix_neighborhood_arena = neighborhood_arena; 296 neighborhood_arena = swap_arena; 297 298 // clear out the old neighborhood for use in next iteration 299 neighborhood_arena.free(); 300 neighborhood.init(&neighborhood_arena, suffix_neighborhood.len(),); 301 302 int suffix_idx = pattern.len - i; 303 String suffix = pattern[suffix_idx..]; 304 //io::printfn("%s, suffix_neighborhood = %s, neighborhood = %s", suffix, suffix_neighborhood, neighborhood); 305 foreach (suffix_neighbor : suffix_neighborhood) { 306 //io::printfn("%s, %s, %s, %s", suffix, suffix_neighbor, suffix_neighborhood, neighborhood); 307 if (hamming_distance(suffix, suffix_neighbor) < d) { 308 // adding the additional base will be at most hamming dist == d, 309 neighborhood.push("A".concat(&suffix_neighborhood_arena, suffix_neighbor)); 310 neighborhood.push("C".concat(&suffix_neighborhood_arena, suffix_neighbor)); 311 neighborhood.push("G".concat(&suffix_neighborhood_arena, suffix_neighbor)); 312 neighborhood.push("T".concat(&suffix_neighborhood_arena, suffix_neighbor)); 313 } else { 314 // hamming distance == d. It can't be more, because the neighbors are 315 // generated up to hamming distance d. Adding the first symbol from the 316 // original pattern cannot increase the hamming distance. 317 neighborhood.push(pattern[suffix_idx - 1:1].concat(&suffix_neighborhood_arena, suffix_neighbor,)); 318 } 319 } 320 } 321 322 // Copy strings out to final result's allocator 323 String[] res = allocator::new_array(alloc, String, neighborhood.len()); 324 foreach (i, neighbor : neighborhood) { 325 res[i] = neighbor.copy(alloc); 326 } 327 return res; 328 }; 329 } 330 331 struct NeighborsTest { 332 String text; 333 int d; 334 String[] expected; 335 } 336 fn void test_neighbors() @test { 337 NeighborsTest[] tests = { 338 { 339 .text = "ACG", 340 .d = 1, 341 .expected = {"ACG", "ACT", "ACA", "ACC", "ATG", "AGG", "AAG", "TCG", "GCG", "CCG"}, 342 }, 343 { 344 .text = "AGA", 345 .d = 0, 346 .expected = {"AGA"}, 347 }, 348 { 349 .text = "AAA", 350 .d = 1, 351 .expected = {"AAA", "AAC", "AAG", "AAT", "ACA", "AGA", "ATA", "CAA", "GAA", "TAA"}, 352 }, 353 { 354 .text = "A", 355 .d = 1, 356 .expected = {"A", "C", "T", "G"}, 357 }, 358 }; 359 foreach (t : tests) { 360 String[] matches = neighbors(t.text, t.d, tmem); 361 test::expect_equal_slices_sorted(t.expected, matches); 362 } 363 }