From 992213acfc9c34af74d59ff0543c2527d1a8f243 Mon Sep 17 00:00:00 2001 From: rtoumiteklia <66685858+rtoumiteklia@users.noreply.github.com> Date: Wed, 19 Aug 2020 15:35:42 +0200 Subject: [PATCH 01/10] Create interface to read from files --- .../uros/citlab/textalignment/AlignText.java | 144 ++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 src/main/java/de/uros/citlab/textalignment/AlignText.java diff --git a/src/main/java/de/uros/citlab/textalignment/AlignText.java b/src/main/java/de/uros/citlab/textalignment/AlignText.java new file mode 100644 index 0000000..b18bac7 --- /dev/null +++ b/src/main/java/de/uros/citlab/textalignment/AlignText.java @@ -0,0 +1,144 @@ +package de.uros.citlab.textalignment; + +import de.uros.citlab.confmat.CharMap; +import de.uros.citlab.confmat.ConfMat; +import de.uros.citlab.textalignment.types.LineMatch; +//import org.junit.Assert; +//import org.junit.Test; +import java.io.File; +import java.lang.NullPointerException; +import java.io.FileNotFoundException; +import java.util.Scanner; +import java.util.*; + +public class AlignText { + + private static Random r = new Random(1234); + private static double propNaC = 0.5; + private static double doubleChar = 0.5; + private static double variance = 0.5; + private static double offsetBP = 10.0; + + public static void main(String[] args) { + + long startTime = System.currentTimeMillis(); + + TextAligner textAligner = new TextAligner(" ", 4.0, 0.2, 6.0, 0 // threshold 0.1: only very trustful matches, less than 0.01 = caution + ); + Scanner s1; + Scanner s2; + ArrayList references = new ArrayList(); + ArrayList predictions = new ArrayList(); + + try { + s1 = new Scanner(new File(args[0])); + + while (s1.hasNextLine()) { + references.add(s1.nextLine()); + } + s1.close(); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } + + try { + s2 = new Scanner(new File(args[1])); + while (s2.hasNextLine()) { + predictions.add(s2.nextLine()); + } + s2.close(); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } + + + CharMap cm = getCharMap(args[2]); + + List predictionConfMatsList = new ArrayList<>(); + for (String pred : predictions) { + predictionConfMatsList.add(generateConfMat(cm, pred, r)); + } + List alignmentResult = textAligner.getAlignmentResult(references, predictionConfMatsList); + + List res = new LinkedList<>(); + for (int i = 0; i < alignmentResult.size(); i++) { + LineMatch lineMatch = alignmentResult.get(i); + res.add(lineMatch == null ? null : lineMatch.getReference()); + } + + int count = 0; + boolean isNull; + for (int i = 0; i < alignmentResult.size(); i++) { + isNull = alignmentResult.get(i) == null; + if (!isNull) { + count += 1; + } + } + System.out.printf("Number of aligned lines : %d out of %d \n", count, alignmentResult.size()); + + for (int i = 0; i < alignmentResult.size(); i++) { + try { + LineMatch match = alignmentResult.get(i); + String reference = match.getReference(); + double confidence = match.getConfidence(); + System.out.printf("line: %d prediction: %s reference: %s confidence: %s\n", i, predictions.get(i), reference, + confidence); + } catch (NullPointerException e) { + // ignore this line + } + + } + + long endTime = System.currentTimeMillis(); + System.out.println("That took " + (endTime - startTime) + " milliseconds"); + + } + + public static CharMap getCharMap(String chars) { + CharMap res = new CharMap(); + for (int i=0; i < chars.length(); i++) { + res.add(chars.charAt(i)); + } + res.add(' '); + return res; + } + + private static ConfMat generateConfMat(CharMap cm, String reference, Random rnd) { + return generateConfMat(cm, reference, rnd, propNaC, doubleChar, variance, offsetBP); + } + + private static ConfMat generateConfMat(CharMap cm, String reference, Random rnd, double propNaC, double doubleChar, + double variance, double offsetBP) { + StringBuilder sb = new StringBuilder(); + char last = CharMap.NaC; + for (int i = 0; i < reference.length(); i++) { + char cur = reference.charAt(i); + + if (cm.get(cur) == null) { + throw new RuntimeException("character '" + cur + "' is not in CharMap"); + } + if (cur == last || rnd.nextDouble() < propNaC) { + sb.append(CharMap.NaC); + } + sb.append(cur); + if (rnd.nextDouble() < doubleChar) { + sb.append(CharMap.NaC); + } + last = cur; + } + if (rnd.nextDouble() < propNaC) { + sb.append(CharMap.NaC); + } + // BestPath ready + String bp = sb.toString(); + double[][] mat = new double[bp.length()][cm.size()]; + for (int i = 0; i < mat.length; i++) { + double[] vec = mat[i]; + for (int j = 0; j < vec.length; j++) { + vec[j] = rnd.nextGaussian() * variance; + } + vec[cm.get(bp.charAt(i))] += offsetBP; + } + return new ConfMat(cm, mat); + } +} From f2258e7659d8fb1186380e2ef5b97cec42f14d5b Mon Sep 17 00:00:00 2001 From: rtoumiteklia <66685858+rtoumiteklia@users.noreply.github.com> Date: Wed, 19 Aug 2020 15:39:03 +0200 Subject: [PATCH 02/10] Update pom.xml --- pom.xml | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 013b1c1..4a972da 100644 --- a/pom.xml +++ b/pom.xml @@ -10,11 +10,31 @@ textalignment 1.0.2-SNAPSHOT + - de.uros.citlab + de.uros.citlab + ../parent_pom/pom.xml parent_pom 1.5 + + + + + org.apache.maven.plugins + maven-shade-plugin + 3.2.2 + + + package + + shade + + + + + + From a3682260fbe987354de42337e2d38f1998ffced7 Mon Sep 17 00:00:00 2001 From: rtoumiteklia <66685858+rtoumiteklia@users.noreply.github.com> Date: Wed, 19 Aug 2020 16:00:41 +0200 Subject: [PATCH 03/10] set Logging level to INFO --- .../de/uros/citlab/textalignment/TextAligner.java | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/main/java/de/uros/citlab/textalignment/TextAligner.java b/src/main/java/de/uros/citlab/textalignment/TextAligner.java index d64c26c..f975bc6 100644 --- a/src/main/java/de/uros/citlab/textalignment/TextAligner.java +++ b/src/main/java/de/uros/citlab/textalignment/TextAligner.java @@ -106,7 +106,17 @@ private void init() { impl.addCostCalculator(new CostCalculatorSkipChar(1 + 2, 1)); } } + + setLoggingLevel(ch.qos.logback.classic.Level.INFO); + } + + + public static void setLoggingLevel(ch.qos.logback.classic.Level level) { + ch.qos.logback.classic.Logger root = (ch.qos.logback.classic.Logger) org.slf4j.LoggerFactory.getLogger(ch.qos.logback.classic.Logger.ROOT_LOGGER_NAME); + root.setLevel(level); + } + public void setNacOffset(double nacOffset) { this.nacOffset = nacOffset; @@ -302,4 +312,4 @@ private void setRecognition(List confMats) { } } -} \ No newline at end of file +} From c93f36438768897d4809e279ea940708ab936ca2 Mon Sep 17 00:00:00 2001 From: rtoumiteklia <66685858+rtoumiteklia@users.noreply.github.com> Date: Wed, 19 Aug 2020 17:17:54 +0200 Subject: [PATCH 04/10] Update AlignText.java --- src/main/java/de/uros/citlab/textalignment/AlignText.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/de/uros/citlab/textalignment/AlignText.java b/src/main/java/de/uros/citlab/textalignment/AlignText.java index b18bac7..204e563 100644 --- a/src/main/java/de/uros/citlab/textalignment/AlignText.java +++ b/src/main/java/de/uros/citlab/textalignment/AlignText.java @@ -90,7 +90,7 @@ public static void main(String[] args) { } long endTime = System.currentTimeMillis(); - System.out.println("That took " + (endTime - startTime) + " milliseconds"); + System.out.println("Alignment took " + (endTime - startTime) + " milliseconds"); } From 5dadd4d06d1fc76b29d196e23df42ea133f993d0 Mon Sep 17 00:00:00 2001 From: rtoumiteklia <66685858+rtoumiteklia@users.noreply.github.com> Date: Wed, 2 Sep 2020 18:05:50 +0200 Subject: [PATCH 05/10] Update AlignText.java --- .../uros/citlab/textalignment/AlignText.java | 26 ++++++++----------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/src/main/java/de/uros/citlab/textalignment/AlignText.java b/src/main/java/de/uros/citlab/textalignment/AlignText.java index 204e563..8b1049f 100644 --- a/src/main/java/de/uros/citlab/textalignment/AlignText.java +++ b/src/main/java/de/uros/citlab/textalignment/AlignText.java @@ -28,8 +28,8 @@ public static void main(String[] args) { Scanner s1; Scanner s2; ArrayList references = new ArrayList(); - ArrayList predictions = new ArrayList(); - + ArrayList recos = new ArrayList(); + try { s1 = new Scanner(new File(args[0])); @@ -44,21 +44,20 @@ public static void main(String[] args) { try { s2 = new Scanner(new File(args[1])); while (s2.hasNextLine()) { - predictions.add(s2.nextLine()); + recos.add(s2.nextLine()); } s2.close(); } catch (FileNotFoundException e) { e.printStackTrace(); } - CharMap cm = getCharMap(args[2]); - List predictionConfMatsList = new ArrayList<>(); - for (String pred : predictions) { - predictionConfMatsList.add(generateConfMat(cm, pred, r)); + List recoConfMatsList = new ArrayList<>(); + for (String reco : recos) { + recoConfMatsList.add(generateConfMat(cm, reco, r)); } - List alignmentResult = textAligner.getAlignmentResult(references, predictionConfMatsList); + List alignmentResult = textAligner.getAlignmentResult(references, recoConfMatsList); List res = new LinkedList<>(); for (int i = 0; i < alignmentResult.size(); i++) { @@ -81,17 +80,12 @@ public static void main(String[] args) { LineMatch match = alignmentResult.get(i); String reference = match.getReference(); double confidence = match.getConfidence(); - System.out.printf("line: %d prediction: %s reference: %s confidence: %s\n", i, predictions.get(i), reference, + System.out.printf("line: %d prediction: %s reference: %s confidence: %s\n", i, recos.get(i), reference, confidence); } catch (NullPointerException e) { // ignore this line } - } - - long endTime = System.currentTimeMillis(); - System.out.println("Alignment took " + (endTime - startTime) + " milliseconds"); - } public static CharMap getCharMap(String chars) { @@ -100,6 +94,8 @@ public static CharMap getCharMap(String chars) { res.add(chars.charAt(i)); } res.add(' '); + res.add("'"); + res.add("\""); return res; } @@ -113,7 +109,7 @@ private static ConfMat generateConfMat(CharMap cm, String reference, Random rnd, char last = CharMap.NaC; for (int i = 0; i < reference.length(); i++) { char cur = reference.charAt(i); - + if (cm.get(cur) == null) { throw new RuntimeException("character '" + cur + "' is not in CharMap"); } From 2630fbabfffac0fdc550daf2ed9f1eba04d1cc40 Mon Sep 17 00:00:00 2001 From: rtoumiteklia <66685858+rtoumiteklia@users.noreply.github.com> Date: Thu, 3 Sep 2020 11:26:40 +0200 Subject: [PATCH 06/10] Update pom.xml --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 4a972da..d770201 100644 --- a/pom.xml +++ b/pom.xml @@ -13,7 +13,7 @@ de.uros.citlab - ../parent_pom/pom.xml + ../CITlabParentPom/pom.xml parent_pom 1.5 From 2bf25eb5b4e9a0a2a388c2d6c68f1334e1cc8bf5 Mon Sep 17 00:00:00 2001 From: rtoumiteklia <66685858+rtoumiteklia@users.noreply.github.com> Date: Thu, 3 Sep 2020 11:36:37 +0200 Subject: [PATCH 07/10] Update pom.xml --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index d770201..7d02337 100644 --- a/pom.xml +++ b/pom.xml @@ -15,7 +15,7 @@ de.uros.citlab ../CITlabParentPom/pom.xml parent_pom - 1.5 + 1.6 From 9ff2f2fa1e911f10c72f8df842b146fde8926789 Mon Sep 17 00:00:00 2001 From: rtoumiteklia <66685858+rtoumiteklia@users.noreply.github.com> Date: Thu, 3 Sep 2020 11:38:25 +0200 Subject: [PATCH 08/10] Update version --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 7d02337..f1d3b30 100644 --- a/pom.xml +++ b/pom.xml @@ -8,7 +8,7 @@ de.uros.citlab textalignment - 1.0.2-SNAPSHOT + 1.0.3 From caa1ccced112cb387b42e33d50d20e67a7817fd3 Mon Sep 17 00:00:00 2001 From: rtoumiteklia <66685858+rtoumiteklia@users.noreply.github.com> Date: Thu, 3 Sep 2020 14:32:39 +0200 Subject: [PATCH 09/10] use AlignText as mainClass for JAR --- pom.xml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index f1d3b30..84ad971 100644 --- a/pom.xml +++ b/pom.xml @@ -17,7 +17,7 @@ parent_pom 1.6 - + @@ -30,6 +30,13 @@ shade + + + + de.uros.citlab.textalignment.AlignText + + + From f2b7de68f195d3fd55752ec0e74b5c0dda3ffca5 Mon Sep 17 00:00:00 2001 From: rtoumiteklia <66685858+rtoumiteklia@users.noreply.github.com> Date: Fri, 4 Sep 2020 12:15:34 +0200 Subject: [PATCH 10/10] Add \t in CharMap --- src/main/java/de/uros/citlab/textalignment/AlignText.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/de/uros/citlab/textalignment/AlignText.java b/src/main/java/de/uros/citlab/textalignment/AlignText.java index 8b1049f..c6e2025 100644 --- a/src/main/java/de/uros/citlab/textalignment/AlignText.java +++ b/src/main/java/de/uros/citlab/textalignment/AlignText.java @@ -94,6 +94,7 @@ public static CharMap getCharMap(String chars) { res.add(chars.charAt(i)); } res.add(' '); + res.add("\t"); res.add("'"); res.add("\""); return res;