util-frequency.c3 (10467B) - raw


      1 module util;
      2 import std::collections;
      3 import std::core::mem;
      4 import std::core::string;
      5 import std::io;
      6 import std::sort;
      7 
      8 import test;
      9 
     10 fn String reverse_complement(String pattern, Allocator alloc = allocator::heap()) {
     11 	char[] s = allocator::alloc_array(alloc, char, pattern.len);
     12 	foreach (i, c : pattern) {
     13 		ulong idx = s.len - i - 1;
     14 		switch (c) {
     15 			case 'A': s[idx] = 'T';
     16 			case 'C': s[idx] = 'G';
     17 			case 'T': s[idx] = 'A';
     18 			case 'G': s[idx] = 'C';
     19 			default: $$unreachable();
     20 		}
     21 	}
     22 	return (String)s;
     23 }
     24 
     25 fn void test_reverse_complement() @test {
     26 	assert(reverse_complement("AAAACCCGGT", tmem) == "ACCGGGTTTT");
     27 }
     28 
     29 alias FrequencyTable = HashMap{String, int};
     30 
     31 // counts of kmers in a text.
     32 // Lifetime of result is same as that of `text` param.
     33 fn FrequencyTable frequency_table(String text, int k, Allocator alloc = allocator::heap()) {
     34 	FrequencyTable kmer_counts;
     35 	kmer_counts.init(allocator: alloc);
     36 	for (int i = 0; i <= text.len - k; i += 1) {
     37 		// TODO get_or_update? Annoying to get twice
     38 		// Modified std so that hashmap doesn't copy keys, but these
     39 		// keys lifetime is the same as `text`.
     40 		String key = text[i:k];
     41 		kmer_counts.@get_or_set(key, 0);
     42 		if (try count = kmer_counts.get_ref(key)) {
     43 			*count += 1;
     44 		}
     45 	}
     46 	return kmer_counts;
     47 }
     48 
     49 // Lifetime of result is that of the `genome` param.
     50 fn String[] clump_finding(
     51 	String genome,
     52 	int k,
     53 	int region_len, // L
     54 	int clump_threshold, // t
     55 	Allocator alloc = allocator::heap())
     56 {
     57 	if (k == 0 || genome.len == 0) return {};
     58 	HashMap{String, char} clumps; // a set
     59 	@pool() {
     60 		clumps.tinit();
     61 		for (int i = 0; i <= genome.len - region_len; i += 1) {
     62 			@pool() {
     63 				FrequencyTable freq_map = frequency_table(genome[i:region_len], k, tmem);
     64 				freq_map.@each(; String kmer, int count) {
     65 					if (count >= clump_threshold) {
     66 						// TODO can probably just use a growable array on small inputs
     67 						// std hashmap modified to not copy keys, lifetime of kmer is that of
     68 						// the `genome` param
     69 						clumps.set(kmer, 0);
     70 					}
     71 				};
     72 			};
     73 		}
     74 		return clumps.keys(alloc);
     75 	};
     76 }
     77 
     78 fn void test_clump_finding() @test {
     79     String input = "CGGACTCGACAGATGTGAAGAACGACAATGTGAAGACTCGACACGACAGAGTGAAGAGAAGAGGAAACATTGTAA";
     80 	assert(clump_finding(input, 5, 50, 4, tmem) == {"CGACA", "GAAGA"});
     81 }
     82 
     83 fn int hamming_distance(String s1, String s2) {
     84 	assert(s1.len == s2.len);
     85 	int count = 0;
     86 	for (int i = 0; i < s1.len; i += 1) {
     87 		if (s1[i] != s2[i]) {
     88 			count += 1;
     89 		}
     90 	}
     91 	return count;
     92 }
     93 
     94 fn void test_hamming_distance() @test {
     95 	assert(hamming_distance("GGGCCGTTGGT", "GGACCGTTGAC") == 3);
     96 }
     97 
     98 // d is the hamming_distance maximum threshold
     99 fn int[] approximate_pattern_matching(
    100 	String pattern,
    101 	String genome,
    102 	int d,
    103 	Allocator alloc = allocator::heap())
    104 {
    105 	if (pattern.len == 0 || genome.len == 0) return {};
    106 	List{int} res;
    107 	@pool() {
    108 		res.tinit();
    109 		for (int i = 0; i <= genome.len - pattern.len; i += 1) {
    110 			// TODO more efficient to break early when calculating hamming distance
    111 			if (util::hamming_distance(genome[i:pattern.len], pattern) <= d) {
    112 				res.push(i);
    113 			}
    114 		}
    115 		return res.to_array(alloc);
    116 	};
    117 }
    118 
    119 struct ApproxMatchTest {
    120 	String pattern;
    121 	String genome;
    122 	int d;
    123 	int[] expected;
    124 }
    125 fn void test_approximate_pattern_matching() @test {
    126 	ApproxMatchTest[] tests = {
    127 		{
    128 			.pattern = "ATA",
    129 			.genome = "CGATCGAGTACCATAAG",
    130 			.d = 1,
    131 			.expected = {2, 7, 12},
    132 		},
    133 		{
    134 			.pattern = "ATTCTGGA",
    135 			.genome = "CGCCCGAATCCAGAACGCATTCCCATATTTCGGGACCACTGGCCTCCACGGTACGGACGTCAATCAAAT",
    136 			.d = 3,
    137 			.expected = {6, 7, 26, 27},
    138 		},
    139 		{
    140 			.pattern = "AAA",
    141 			.genome = "TTTTTTAAATTTTAAATTTTTT",
    142 			.d = 2,
    143 			.expected = {4, 5, 6, 7, 8, 11, 12, 13, 14, 15},
    144 		},
    145 	};
    146 	foreach (t : tests) {
    147 		int[] matches = approximate_pattern_matching(t.pattern, t.genome, t.d, tmem);
    148 		assert(matches == t.expected, "Expected %s, found %s", t.expected, matches);
    149 	}
    150 }
    151 
    152 // Just like frequent words, except instead of matching just each kmer,
    153 // we create a neighbohood(d) for each kmer and that neighborhood is compared against the text
    154 fn String[] _frequent_words_with_mismatches(
    155 	String text,
    156 	int k,
    157 	int d,
    158 	bool with_rc,
    159 	Allocator alloc= allocator::heap())
    160 {
    161 	@pool() {
    162 		if (with_rc) {
    163 			text = text.tconcat(reverse_complement(text, allocator::temp()));
    164 		}
    165 		FrequencyTable freq_map;
    166 		freq_map.init(allocator: alloc);
    167 		for (int i = 0; i <= text.len - k; i += 1) {
    168 			String pattern = text[i:k];
    169 			String[] neighborhood = neighbors(pattern, d, allocator::temp());
    170 			foreach (neighbor : neighborhood) {
    171 				freq_map.@get_or_set(neighbor, 0);
    172 				if (try int* count = freq_map.get_ref(neighbor)) {
    173 					*count += 1;
    174 				}
    175 			}
    176 		}
    177 
    178 		int[] counts = freq_map.tvalues();
    179 		int max = 0;
    180 		freq_map.@each(; String _k, int count) {
    181 			if (count > max) max = count;
    182 		};
    183 
    184 		List{String} res;
    185 		res.tinit();
    186 		freq_map.@each(; String key, int v) {
    187 			if (v == max) {
    188 				res.push(key.copy(alloc));
    189 			}
    190 		};
    191 		return res.to_array(alloc);
    192 	};
    193 }
    194 
    195 fn String[] frequent_words_with_mismatches(String text, int k, int d, Allocator alloc= allocator::heap()) {
    196 	return _frequent_words_with_mismatches(text, k, d, false, alloc);
    197 }
    198 fn String[] frequent_words_with_mismatches_and_rc(String text, int k, int d, Allocator alloc= allocator::heap()) {
    199 	return _frequent_words_with_mismatches(text, k, d, true, alloc);
    200 }
    201 
    202 struct FreqWordsMismatchesTest {
    203 	String text;
    204 	int k;
    205 	int d;
    206 	String[] expected;
    207 }
    208 fn void test_frequent_words_with_mismatches() @test {
    209 	FreqWordsMismatchesTest[] tests = {
    210 		{
    211 			.text = "ACGTTGCATGTCGCATGATGCATGAGAGCT",
    212 			.k = 4,
    213 			.d = 1,
    214 			.expected = {"ATGT", "GATG", "ATGC"},
    215 		},
    216 		{
    217 			.text = "AGGT",
    218 			.k = 2,
    219 			.d = 1,
    220 			.expected = {"GG"},
    221 		},
    222 		{
    223 			.text = "AGGGT",
    224 			.k = 2,
    225 			.d = 0,
    226 			.expected = {"GG"},
    227 		},
    228 		{
    229 			.text = "AGGCGG",
    230 			.k = 3,
    231 			.d = 0,
    232 			.expected = {"AGG", "GGC", "GCG", "CGG"},
    233 		},
    234 	};
    235 	foreach (t : tests) {
    236 		@pool() {
    237 			String[] matches = frequent_words_with_mismatches(t.text, t.k, t.d, tmem);
    238 			test::expect_equal_slices_sorted(t.expected, matches);
    239 		};
    240 	}
    241 }
    242 fn void test_frequent_words_with_mismatches_and_rc() @test {
    243 	FreqWordsMismatchesTest[] tests = {
    244 		{
    245 			.text = "ACGTTGCATGTCGCATGATGCATGAGAGCT",
    246 			.k = 4,
    247 			.d = 1,
    248 			.expected = {"ATGT", "ACAT"},
    249 		},
    250 	};
    251 	foreach (t : tests) {
    252 		String[] matches = frequent_words_with_mismatches_and_rc(t.text, t.k, t.d, tmem);
    253 		test::expect_equal_slices_sorted(t.expected, matches);
    254 	}
    255 }
    256 
    257 // Build neighbors by building kmers over the neighborhood of each suffix.
    258 // So basically, start iterating from the last element (as the base suffix)
    259 // neighborhood, and build up kmers within d using the previous neighborhood +
    260 // a prefix element.
    261 fn String[] neighbors(String pattern, int d, Allocator alloc = allocator::heap())
    262 {
    263 	if (d == 0) return {pattern};
    264 	if (pattern.len == 0) return {};
    265 	@pool() {
    266 		DynamicArenaAllocator neighborhood_arena;
    267 		DynamicArenaAllocator suffix_neighborhood_arena;
    268 		neighborhood_arena.init(tmem, 1024);
    269 		suffix_neighborhood_arena.init(tmem, 1024);
    270 		List{String} neighborhood;
    271 		List{String} suffix_neighborhood;
    272 		neighborhood.init_with_array(&neighborhood_arena, {"A", "C", "G", "T"});
    273 		suffix_neighborhood.init(&suffix_neighborhood_arena, 16);
    274 		for (int i = 1; i < pattern.len; i += 1) {
    275 			// swap suffix_neighborhood and neighborhood. The previous
    276 			// neighborhood becomes the new suffix_neighborhood to iterate
    277 			// over. Now we need a place to collect the next neighborhood into,
    278 			// so we reuse the memory from the old suffix_neighborhood, which is
    279 			// no longer useful for future calculation, as its results have already
    280 			// been moved into the old neighborhood/new suffix_neighborhood as
    281 			// calculated during the last iteration. (otherwise we have to keep
    282 			// allocating new memory to collect the next neighborhood into).
    283 			//
    284 			// The memory itself is cleared for reuse by the arena holding the
    285 			// List (see next swap)
    286 			List{String} swap_var;
    287 			swap_var = suffix_neighborhood;
    288 			suffix_neighborhood = neighborhood;
    289 			neighborhood = swap_var;
    290 
    291 			// Same swap with the arenas
    292 			// Old strings and the List holding them all get cleared
    293 			DynamicArenaAllocator swap_arena;
    294 			swap_arena = suffix_neighborhood_arena;
    295 			suffix_neighborhood_arena = neighborhood_arena;
    296 			neighborhood_arena = swap_arena;
    297 
    298 			// clear out the old neighborhood for use in next iteration
    299 			neighborhood_arena.free();
    300 			neighborhood.init(&neighborhood_arena, suffix_neighborhood.len(),);
    301 
    302 			int suffix_idx = pattern.len - i;
    303 			String suffix = pattern[suffix_idx..];
    304 			//io::printfn("%s, suffix_neighborhood = %s, neighborhood = %s", suffix, suffix_neighborhood, neighborhood);
    305 			foreach (suffix_neighbor : suffix_neighborhood) {
    306 				//io::printfn("%s, %s, %s, %s", suffix, suffix_neighbor, suffix_neighborhood, neighborhood);
    307 				if (hamming_distance(suffix, suffix_neighbor) < d) {
    308 					// adding the additional base will be at most hamming dist == d,
    309 					neighborhood.push("A".concat(&suffix_neighborhood_arena, suffix_neighbor));
    310 					neighborhood.push("C".concat(&suffix_neighborhood_arena, suffix_neighbor));
    311 					neighborhood.push("G".concat(&suffix_neighborhood_arena, suffix_neighbor));
    312 					neighborhood.push("T".concat(&suffix_neighborhood_arena, suffix_neighbor));
    313 				} else {
    314 					// hamming distance == d. It can't be more, because the neighbors are
    315 					// generated up to hamming distance d. Adding the first symbol from the
    316 					// original pattern cannot increase the hamming distance.
    317 					neighborhood.push(pattern[suffix_idx - 1:1].concat(&suffix_neighborhood_arena, suffix_neighbor,));
    318 				}
    319 			}
    320 		}
    321 
    322 		// Copy strings out to final result's allocator
    323 		String[] res = allocator::new_array(alloc, String, neighborhood.len());
    324 		foreach (i, neighbor : neighborhood) {
    325 			res[i] = neighbor.copy(alloc);
    326 		}
    327 		return res;
    328 	};
    329 }
    330 
    331 struct NeighborsTest {
    332 	String text;
    333 	int d;
    334 	String[] expected;
    335 }
    336 fn void test_neighbors() @test {
    337 	NeighborsTest[] tests = {
    338 		{
    339 			.text = "ACG",
    340 			.d = 1,
    341 			.expected = {"ACG", "ACT", "ACA", "ACC", "ATG", "AGG", "AAG", "TCG", "GCG", "CCG"},
    342 		},
    343 		{
    344 			.text = "AGA",
    345 			.d = 0,
    346 			.expected = {"AGA"},
    347 		},
    348 		{
    349 			.text = "AAA",
    350 			.d = 1,
    351 			.expected = {"AAA", "AAC", "AAG", "AAT", "ACA", "AGA", "ATA", "CAA", "GAA", "TAA"},
    352 		},
    353 		{
    354 			.text = "A",
    355 			.d = 1,
    356 			.expected = {"A", "C", "T", "G"},
    357 		},
    358 	};
    359 	foreach (t : tests) {
    360 		String[] matches = neighbors(t.text, t.d, tmem);
    361 		test::expect_equal_slices_sorted(t.expected, matches);
    362 	}
    363 }