diff --git a/pom.xml b/pom.xml index 013b1c1..84ad971 100644 --- a/pom.xml +++ b/pom.xml @@ -8,14 +8,41 @@ de.uros.citlab textalignment - 1.0.2-SNAPSHOT + 1.0.3 + - de.uros.citlab + de.uros.citlab + ../CITlabParentPom/pom.xml parent_pom - 1.5 + 1.6 + + + + org.apache.maven.plugins + maven-shade-plugin + 3.2.2 + + + package + + shade + + + + + de.uros.citlab.textalignment.AlignText + + + + + + + + +
diff --git a/src/main/java/de/uros/citlab/textalignment/AlignText.java b/src/main/java/de/uros/citlab/textalignment/AlignText.java
new file mode 100644
index 0000000..c6e2025
--- /dev/null
+++ b/src/main/java/de/uros/citlab/textalignment/AlignText.java
@@ -0,0 +1,192 @@
+package de.uros.citlab.textalignment;
+
+import de.uros.citlab.confmat.CharMap;
+import de.uros.citlab.confmat.ConfMat;
+import de.uros.citlab.textalignment.types.LineMatch;
+//import org.junit.Assert;
+//import org.junit.Test;
+import java.io.File;
+import java.lang.NullPointerException;
+import java.io.FileNotFoundException;
+import java.util.Scanner;
+import java.util.*;
+
+/**
+ * Command-line tool that aligns reference text lines to recognised text lines.
+ *
+ * Usage: AlignText REFERENCE_FILE RECOGNITION_FILE CHARACTERS
+ *
+ * Both files are read line by line. Every recognised line is turned into a
+ * synthetic, noisy confidence matrix over the characters given in CHARACTERS
+ * (plus space, tab, apostrophe and double quote) and handed to
+ * {@link TextAligner} together with the reference lines.
+ */
+public class AlignText {
+
+    /** Fixed seed, so the generated confidence matrices are the same on every run. */
+    private static final Random RANDOM = new Random(1234);
+    /** Probability of inserting a NaC in front of a character and at the end of the line. */
+    private static final double PROP_NAC = 0.5;
+    /** Probability of inserting a NaC directly behind a character. */
+    private static final double DOUBLE_CHAR = 0.5;
+    /** Factor applied to the Gaussian noise written into every matrix cell. */
+    private static final double VARIANCE = 0.5;
+    /** Bonus added to the cell of the best-path character in every matrix row. */
+    private static final double OFFSET_BP = 10.0;
+
+    /**
+     * Runs the alignment and prints the matched lines to standard output.
+     *
+     * Exits with status 1 when an argument is missing or an input file cannot be opened.
+     *
+     * @param args reference file path, recognition file path, and the characters
+     *             the recognised lines may contain
+     */
+    public static void main(String[] args) {
+        if (args.length < 3) {
+            System.err.println("Usage: AlignText REFERENCE_FILE RECOGNITION_FILE CHARACTERS");
+            System.exit(1);
+        }
+
+        TextAligner textAligner = new TextAligner(" ", 4.0, 0.2, 6.0, 0 // threshold 0.1: only very trustful matches, less than 0.01 = caution
+        );
+
+        List<String> references;
+        List<String> recos;
+        try {
+            references = readLines(args[0]);
+            recos = readLines(args[1]);
+        } catch (FileNotFoundException e) {
+            // Aligning against an empty list is meaningless, so stop instead of carrying on.
+            System.err.println("Cannot open input file: " + e.getMessage());
+            System.exit(1);
+            return;
+        }
+
+        CharMap cm = getCharMap(args[2]);
+
+        List<ConfMat> recoConfMatsList = new ArrayList<>();
+        for (String reco : recos) {
+            recoConfMatsList.add(generateConfMat(cm, reco, RANDOM));
+        }
+        List<LineMatch> alignmentResult = textAligner.getAlignmentResult(references, recoConfMatsList);
+
+        int count = 0;
+        for (LineMatch match : alignmentResult) {
+            if (match != null) {
+                count++;
+            }
+        }
+        System.out.printf("Number of aligned lines : %d out of %d \n", count, alignmentResult.size());
+
+        for (int i = 0; i < alignmentResult.size(); i++) {
+            LineMatch match = alignmentResult.get(i);
+            if (match == null) {
+                // Entries without a match are skipped; they only show up in the total printed above.
+                continue;
+            }
+            String reference = match.getReference();
+            double confidence = match.getConfidence();
+            System.out.printf("line: %d prediction: %s reference: %s confidence: %s\n", i, recos.get(i), reference,
+                    confidence);
+        }
+    }
+
+    /**
+     * Reads a text file into a list with one entry per line.
+     *
+     * @param path location of the file to read
+     * @return the lines of the file in file order
+     * @throws FileNotFoundException if the file cannot be opened for reading
+     */
+    private static List<String> readLines(String path) throws FileNotFoundException {
+        List<String> lines = new ArrayList<>();
+        // try-with-resources closes the scanner even when reading fails half-way
+        try (Scanner scanner = new Scanner(new File(path))) {
+            while (scanner.hasNextLine()) {
+                lines.add(scanner.nextLine());
+            }
+        }
+        return lines;
+    }
+
+    /**
+     * Builds a character map from the given characters.
+     *
+     * Space, tab, apostrophe and double quote are always added after the given characters.
+     *
+     * @param chars characters to register, added in string order
+     * @return the populated character map
+     */
+    public static CharMap getCharMap(String chars) {
+        CharMap res = new CharMap();
+        for (int i = 0; i < chars.length(); i++) {
+            res.add(chars.charAt(i));
+        }
+        res.add(' ');
+        res.add("\t");
+        res.add("'");
+        res.add("\"");
+        return res;
+    }
+
+    /**
+     * Generates a confidence matrix for a line using the class-wide noise settings.
+     */
+    private static ConfMat generateConfMat(CharMap cm, String reference, Random rnd) {
+        return generateConfMat(cm, reference, rnd, PROP_NAC, DOUBLE_CHAR, VARIANCE, OFFSET_BP);
+    }
+
+    /**
+     * Generates a noisy confidence matrix whose best path spells the given line.
+     *
+     * The line is first expanded into a best-path string by randomly inserting NaC
+     * symbols; a NaC is always inserted between two equal neighbouring characters.
+     * Each best-path position becomes one matrix row filled with Gaussian noise, and
+     * the cell of the best-path character is raised by {@code offsetBP}.
+     *
+     * @param cm         character map that defines the matrix columns
+     * @param reference  line the best path should spell
+     * @param rnd        source of randomness
+     * @param propNaC    probability of a NaC in front of a character and at the line end
+     * @param doubleChar probability of a NaC directly behind a character
+     * @param variance   factor applied to the Gaussian noise
+     * @param offsetBP   bonus for the best-path cell of every row
+     * @return the generated confidence matrix
+     * @throws IllegalArgumentException if the line contains a character missing from {@code cm}
+     */
+    private static ConfMat generateConfMat(CharMap cm, String reference, Random rnd, double propNaC, double doubleChar,
+            double variance, double offsetBP) {
+        StringBuilder sb = new StringBuilder();
+        char last = CharMap.NaC;
+        for (int i = 0; i < reference.length(); i++) {
+            char cur = reference.charAt(i);
+
+            if (cm.get(cur) == null) {
+                throw new IllegalArgumentException("character '" + cur + "' is not in CharMap");
+            }
+            if (cur == last || rnd.nextDouble() < propNaC) {
+                sb.append(CharMap.NaC);
+            }
+            sb.append(cur);
+            if (rnd.nextDouble() < doubleChar) {
+                sb.append(CharMap.NaC);
+            }
+            last = cur;
+        }
+        if (rnd.nextDouble() < propNaC) {
+            sb.append(CharMap.NaC);
+        }
+        // BestPath ready
+        String bp = sb.toString();
+        double[][] mat = new double[bp.length()][cm.size()];
+        for (int i = 0; i < mat.length; i++) {
+            double[] vec = mat[i];
+            for (int j = 0; j < vec.length; j++) {
+                vec[j] = rnd.nextGaussian() * variance;
+            }
+            vec[cm.get(bp.charAt(i))] += offsetBP;
+        }
+        return new ConfMat(cm, mat);
+    }
+}
diff --git a/src/main/java/de/uros/citlab/textalignment/TextAligner.java b/src/main/java/de/uros/citlab/textalignment/TextAligner.java index d64c26c..f975bc6 100644 --- a/src/main/java/de/uros/citlab/textalignment/TextAligner.java +++ b/src/main/java/de/uros/citlab/textalignment/TextAligner.java @@ -106,7 +106,17 @@ private void init() { impl.addCostCalculator(new CostCalculatorSkipChar(1 + 2, 1)); } } + + setLoggingLevel(ch.qos.logback.classic.Level.INFO); + } + + + public static void setLoggingLevel(ch.qos.logback.classic.Level level) { + ch.qos.logback.classic.Logger root = (ch.qos.logback.classic.Logger) org.slf4j.LoggerFactory.getLogger(ch.qos.logback.classic.Logger.ROOT_LOGGER_NAME); + root.setLevel(level); + } + public void setNacOffset(double nacOffset) { this.nacOffset = nacOffset; @@ -302,4 +312,4 @@ private void setRecognition(List confMats) { } } -} \ No newline at end of file +}