This repository was archived by the owner on May 10, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathParser.java
More file actions
106 lines (96 loc) · 2.78 KB
/
Parser.java
File metadata and controls
106 lines (96 loc) · 2.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import java.io.File;
import java.io.FileNotFoundException;
import java.util.Scanner;
/**
 * Parses a .gbk file and collects the nucleotide data of its ORIGIN blocks.
 *
 * <p>
 * Every line between a line containing {@code ORIGIN} and the next line
 * containing {@code //} is treated as sequence data. Only the bases A, C, G and
 * T (either case) are kept. Each run of {@code N}/{@code n} characters ends the
 * current sub sequence, and each ORIGIN block ends with a line feed, so the
 * collected text holds one sub sequence per line.
 *
 * @author thomasreinking
 *
 */
public class Parser {
    /** The bases that are copied into the gene string; everything else is skipped. */
    private static final String BASES = "AaCcGgTt";

    /** All sub sequences found in the file, separated by line feeds. */
    private final String geneString;

    /**
     * Sets up and completes the parsing for the file and adds every gene block to
     * the gene string.
     *
     * <p>
     * If the file cannot be found a message is printed to standard output and the
     * gene string stays empty.
     *
     * @param file
     *            The file to be parsed.
     */
    public Parser(File file) {
        StringBuilder genes = new StringBuilder();
        /* try-with-resources closes the scanner (and the file) on every path */
        try (Scanner scan = new Scanner(file)) {
            while (scan.hasNextLine()) {
                if (scan.nextLine().contains("ORIGIN")) {
                    /* Found a gene block */
                    readGeneBlock(scan, genes);
                }
            }
        } catch (FileNotFoundException e) {
            System.out.println("The file, " + file + ", cannot be found.");
        }
        geneString = genes.toString();
    }

    /**
     * Reads the lines of one gene block up to its {@code //} terminator.
     *
     * <p>
     * Reaching the end of the file counts as the end of the block, so a file that
     * is missing the terminator is still parsed instead of failing.
     *
     * @param scan
     *            scanner positioned on the first line after the ORIGIN line
     * @param genes
     *            buffer that receives the bases of the block
     */
    private static void readGeneBlock(Scanner scan, StringBuilder genes) {
        while (scan.hasNextLine()) {
            String line = scan.nextLine();
            if (line.contains("//")) {
                break;
            }
            appendBases(line, genes);
        }
        /* If the file contains more than one gene block then they will be separated */
        genes.append('\n');
    }

    /**
     * Copies the bases of one sequence line into the buffer and starts a new sub
     * sequence at every run of N.
     *
     * @param line
     *            one line of a gene block
     * @param genes
     *            buffer that receives the bases of the line
     */
    private static void appendBases(String line, StringBuilder genes) {
        for (int i = 0; i < line.length(); i++) {
            char c = line.charAt(i);
            if (c == 'N' || c == 'n') {
                /*
                 * Break only when a sub sequence is open; this keeps a run of several
                 * N (even one spanning two lines) from producing empty sub sequences.
                 */
                int last = genes.length() - 1;
                if (last >= 0 && genes.charAt(last) != '\n') {
                    genes.append('\n');
                }
            } else if (BASES.indexOf(c) >= 0) {
                genes.append(c);
            }
            /* Position numbers, blanks and any other character are dropped */
        }
    }

    /**
     * Gets the geneString generated from parsing the file
     *
     * @return geneString of the file; empty if the file was not found or has no
     *         ORIGIN line
     */
    public String getGeneString() {
        return geneString;
    }

    /**
     * Splits the geneString into its sub sequences, one per line of the
     * geneString.
     *
     * <p>
     * Trailing empty entries are dropped by the split. An empty entry can still
     * appear between two gene blocks when the first one ends with N or holds no
     * bases.
     *
     * @return An array of String[] that has all the sub sequences
     */
    public String[] getSubGeneStrings() {
        return geneString.split("\\r?\\n");
    }
}