forked from rampatra/Algorithms-and-Data-Structures-in-Java
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRepeatedDnaSequence.java
89 lines (74 loc) · 3.19 KB
/
RepeatedDnaSequence.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
package com.leetcode.hashtables;
import java.util.*;
import static org.junit.jupiter.api.Assertions.assertEquals;
/**
* Level: Medium
* Problem Link: https://leetcode.com/problems/repeated-dna-sequences/submissions/
* Problem Description:
* All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: "ACGAATTCCG". When
* studying DNA, it is sometimes useful to identify repeated sequences within the DNA.
*
* Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.
*
* Example:
* Input: s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT"
* Output: ["AAAAACCCCC", "CCCCCAAAAA"]
*
* TODO: Figure another method which would have a better runtime.
*
* @author rampatra
* @since 2019-07-29
*/
public class RepeatedDnaSequence {

    /** Number of characters in every sequence (window) that is examined. */
    private static final int SEQUENCE_LENGTH = 10;

    /** Radix of the polynomial rolling hash. */
    private static final long RADIX = 4;

    /** Weight of a window's first character in its hash: RADIX^(SEQUENCE_LENGTH - 1), i.e. 4^9. */
    private static final long LEADING_WEIGHT = 1L << (2 * (SEQUENCE_LENGTH - 1));

    /**
     * Finds every 10-character substring that occurs more than once in {@code s}, using a
     * Rabin-Karp style rolling hash to bucket the windows.
     * <p>
     * Each character contributes the digit {@code c - 'A'} (A=0, C=2, G=6, T=19) under radix 4.
     * Because digits can exceed the radix, different sequences can share a hash (for example
     * "GCG" and "CTC" both evaluate to 110), so a hash match alone is not proof of a repeat:
     * the actual strings stored under that hash are compared as well. This also means the
     * method works for arbitrary characters, not only A, C, G and T.
     * <p>
     * Time Complexity: O(n * L) for input length n and window length L = 10, since every
     * window is materialised as a substring and hashed by the string set.
     * Space Complexity: O(n * L), as every distinct window is retained.
     *
     * @param s the DNA string to scan; {@code null} is treated like an input that is too short
     * @return a new mutable list holding each repeated sequence exactly once, ordered by the
     *         position at which its second occurrence was found; empty when {@code s} is
     *         {@code null} or shorter than 10 characters
     */
    public static List<String> findRepeatedDnaSequences(String s) {
        if (s == null || s.length() < SEQUENCE_LENGTH) {
            return new ArrayList<>();
        }

        // LinkedHashSet de-duplicates while keeping a deterministic (detection) order.
        Set<String> repeatedSequences = new LinkedHashSet<>();
        Map<Long, Set<String>> sequencesByHash = new HashMap<>();

        long hash = computeHash(s);
        int lastStart = s.length() - SEQUENCE_LENGTH;
        for (int start = 0; start <= lastStart; start++) {
            if (start > 0) {
                // Slide the window: drop the character that left, shift, append the new one.
                hash = (hash - LEADING_WEIGHT * digit(s.charAt(start - 1))) * RADIX
                        + digit(s.charAt(start + SEQUENCE_LENGTH - 1));
            }

            String sequence = s.substring(start, start + SEQUENCE_LENGTH);
            Set<String> seenWithHash = sequencesByHash.computeIfAbsent(hash, key -> new HashSet<>());
            // add() returns false when this exact sequence was already recorded under the hash.
            if (!seenWithHash.add(sequence)) {
                repeatedSequences.add(sequence);
            }
        }
        return new ArrayList<>(repeatedSequences);
    }

    /**
     * Computes the polynomial hash of the first {@link #SEQUENCE_LENGTH} characters of
     * {@code s}, with the first character as the most significant digit.
     *
     * @param s a string of at least {@link #SEQUENCE_LENGTH} characters
     * @return the hash of the leading window
     */
    private static long computeHash(String s) {
        long hash = 0;
        // Horner's rule keeps the arithmetic in exact integers.
        for (int i = 0; i < SEQUENCE_LENGTH; i++) {
            hash = hash * RADIX + digit(s.charAt(i));
        }
        return hash;
    }

    /**
     * Maps a character to the digit it contributes to the rolling hash.
     *
     * @param c the character to convert
     * @return the offset of {@code c} from 'A'
     */
    private static int digit(char c) {
        return c - 'A';
    }

    /**
     * Runs a few self-checks and throws {@link AssertionError} on the first mismatch.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        expectEquals(new ArrayList<>(),
                findRepeatedDnaSequences("AAAAACCC"));
        expectEquals(Arrays.asList("AAAAACCCCC", "CCCCCAAAAA"),
                findRepeatedDnaSequences("AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT"));
        expectEquals(Collections.singletonList("AAAAAAAAAA"),
                findRepeatedDnaSequences("AAAAAAAAAAAA"));
        expectEquals(Collections.singletonList("BBBBBBBBBB"),
                findRepeatedDnaSequences("BBBBBBBBBBBB"));
    }

    /**
     * Fails with an {@link AssertionError} when the two lists differ.
     *
     * @param expected the list the caller expects
     * @param actual   the list that was produced
     */
    private static void expectEquals(List<String> expected, List<String> actual) {
        if (!expected.equals(actual)) {
            throw new AssertionError("expected: " + expected + " but was: " + actual);
        }
    }
}