Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
140 changes: 112 additions & 28 deletions core/src/main/java/org/incenp/obofoundry/sssom/TSVWriter.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import org.incenp.obofoundry.sssom.model.ExtensionDefinition;
import org.incenp.obofoundry.sssom.model.ExtensionValue;
Expand Down Expand Up @@ -72,9 +71,6 @@
*/
public class TSVWriter extends SSSOMWriter {

private static final Pattern tsvSpecialChars = Pattern.compile("[\t\n\r\"]");
private static final Pattern csvSpecialChars = Pattern.compile("[,\n\r\"]");

private BufferedWriter tsvWriter, metaWriter;
private Set<String> usedPrefixes = new HashSet<String>();
private boolean isCSV = false;
Expand Down Expand Up @@ -630,16 +626,7 @@ public void visit(StringSlot<Mapping> slot, Mapping object, List<String> values)
results.add("");
return;
}

StringBuilder sb = new StringBuilder();
for ( int i = 0, n = values.size(); i < n; i++ ) {
String value = values.get(i);
sb.append(escapeTSV(value));
if ( i < n - 1 ) {
sb.append('|');
}
}
results.add(sb.toString());
results.add(escapeTSV(values));
}

@Override
Expand All @@ -649,15 +636,7 @@ public void visit(EntityReferenceSlot<Mapping> slot, Mapping object, List<String
return;
}

StringBuilder sb = new StringBuilder();
for ( int i = 0, n = values.size(); i < n; i++ ) {
String value = values.get(i);
sb.append(escapeTSV(prefixManager.shortenIdentifier(value)));
if ( i < n - 1 ) {
sb.append('|');
}
}
results.add(sb.toString());
results.add(escapeTSV(prefixManager.shortenIdentifiers(values)));
}

@Override
Expand Down Expand Up @@ -691,12 +670,117 @@ public void visit(ExtensionSlot<Mapping> slot, Mapping object, Map<String, Exten
* https://datatracker.ietf.org/doc/html/rfc4180#section-2
*/
private String escapeTSV(String value) {
Pattern specialChars = isCSV ? csvSpecialChars : tsvSpecialChars;
if ( specialChars.matcher(value).find() ) {
return "\"" + value.replace("\"", "\"\"") + "\"";
} else {
return value;
StringBuilder sb = new StringBuilder();
int len = value.length();
boolean quotesNeeded = false;
for ( int i = 0; i < len; i++ ) {
char c = value.charAt(i);
switch ( c ) {
case ',':
if ( isCSV ) {
quotesNeeded = true;
}
break;

case '\t':
if ( !isCSV ) {
quotesNeeded = true;
}
break;

case '\n':
case '\r':
quotesNeeded = true;
break;

case '"':
quotesNeeded = true;
sb.append('"');
break;
}
sb.append(c);
}

if ( quotesNeeded ) {
sb.insert(0, '"');
sb.append('"');
}

return sb.toString();
}

/*
 * Serialises a multi-valued slot as a single TSV/CSV cell.
 *
 * The values are joined with '|'. Within each value, a literal pipe is
 * prefixed with a backslash, and a literal backslash is doubled only where it
 * would otherwise be ambiguous: when the next character is a backslash or a
 * pipe, or when it ends a value that is followed by another value (so that it
 * does not appear to escape the separator).
 *
 * Quoting is decided for the cell as a whole, not per value: if any value
 * contains a double quote, a CR or LF, the active column separator (comma in
 * CSV mode, tab otherwise), the complete joined string is wrapped in double
 * quotes. Double quotes inside values are always doubled. This is why the
 * single-value escaping method cannot simply be invoked on each item.
 */
private String escapeTSV(List<String> values) {
    StringBuilder cell = new StringBuilder();
    boolean mustQuote = false;
    int lastValue = values.size() - 1;

    for ( int idx = 0; idx <= lastValue; idx++ ) {
        if ( idx != 0 ) {
            cell.append('|');
        }

        String item = values.get(idx);
        int lastChar = item.length() - 1;
        for ( int pos = 0; pos <= lastChar; pos++ ) {
            char c = item.charAt(pos);

            if ( c == '"' ) {
                // Quotes are doubled and force quoting of the whole cell.
                mustQuote = true;
                cell.append('"');
            } else if ( c == '\n' || c == '\r' ) {
                mustQuote = true;
            } else if ( c == ',' ) {
                // Only significant when commas are the column separator.
                mustQuote = mustQuote || isCSV;
            } else if ( c == '\t' ) {
                // Only significant when tabs are the column separator.
                mustQuote = mustQuote || !isCSV;
            } else if ( c == '|' ) {
                cell.append('\\');
            } else if ( c == '\\' ) {
                boolean ambiguous;
                if ( pos < lastChar ) {
                    char following = item.charAt(pos + 1);
                    ambiguous = following == '\\' || following == '|';
                } else {
                    // Trailing backslash: ambiguous only if a separator comes next.
                    ambiguous = idx < lastValue;
                }
                if ( ambiguous ) {
                    cell.append('\\');
                }
            }

            cell.append(c);
        }
    }

    return mustQuote ? "\"" + cell + "\"" : cell.toString();
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ public class YAMLConverter {
private ExtensionSlotManager extensionManager;
private ExtraMetadataPolicy extraPolicy = ExtraMetadataPolicy.NONE;
private Version assumedVersion = Version.SSSOM_1_0;
private boolean supportEscapedPipes = false;

/**
* Creates a new YAML converter.
Expand Down Expand Up @@ -184,6 +185,7 @@ public MappingSet convertMappingSet(Map<String, Object> rawMap) throws SSSOMForm
if ( version == Version.UNKNOWN ) {
version = Version.LATEST;
}
supportEscapedPipes = version != Version.SSSOM_1_0;

// Process the CURIE map, so that we can expand CURIEs as soon as possible
Object rawCurieMap = rawMap.getOrDefault("curie_map", new HashMap<String, String>());
Expand Down Expand Up @@ -400,7 +402,7 @@ private List<String> getListOfStrings(String slotName, Object rawValue) throws S
* they were single-valued, which is strictly speaking incorrect but happens in the wild
* (including in the examples shown in the SSSOM documentation!).
*/
for ( String item : rawValue.toString().split("\\|") ) {
for ( String item : splitString(rawValue.toString()) ) {
value.add(item);
}
} else {
Expand All @@ -409,6 +411,49 @@ private List<String> getListOfStrings(String slotName, Object rawValue) throws S
return value;
}

/*
 * Breaks a string into items along unescaped pipe ('|') characters. Empty
 * items (leading, trailing or consecutive pipes) are dropped.
 *
 * When escaped pipes are supported, a backslash that is immediately followed
 * by another backslash or by a pipe is consumed and the following character is
 * taken literally. A backslash in any other position (including at the very
 * end of the string) is an ordinary character. When escaped pipes are not
 * supported, backslashes are never special.
 */
private List<String> splitString(String value) {
    List<String> items = new ArrayList<>();
    StringBuilder current = new StringBuilder();
    int length = value.length();
    int pos = 0;

    while ( pos < length ) {
        char c = value.charAt(pos++);

        if ( c == '|' ) {
            // Unescaped separator: close the current item, unless it is empty.
            if ( current.length() > 0 ) {
                items.add(current.toString());
                current.setLength(0);
            }
            continue;
        }

        if ( c == '\\' && supportEscapedPipes && pos < length ) {
            char following = value.charAt(pos);
            if ( following == '\\' || following == '|' ) {
                // Drop the escaping backslash and keep the escaped character.
                c = following;
                pos++;
            }
        }

        current.append(c);
    }

    if ( current.length() > 0 ) {
        items.add(current.toString());
    }

    return items;
}

/*
* Parses the "extension_definitions" key.
*/
Expand Down
32 changes: 32 additions & 0 deletions core/src/test/java/org/incenp/obofoundry/sssom/TSVReaderTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -493,6 +493,38 @@ void testEscapedTSV() throws IOException, SSSOMFormatException {
Assertions.assertEquals("Value\u0009with\u0009tab\u0009characters", m.getComment());
Assertions.assertEquals("Value with \"quote\" characters", m.getObjectLabel());
Assertions.assertEquals("Value with\nnew line character", m.getIssueTrackerItem());
Assertions.assertEquals("Alice", m.getAuthorLabel().get(0));
Assertions.assertEquals("Bob\tand\tCharlie", m.getAuthorLabel().get(1));
}

/*
 * Test that the parser can handle escaped pipe characters in multi-valued
 * slots.
 *
 * The same fixture file is read twice: first with SSSOM 1.1 as the assumed
 * version, where backslash escaping is expected to be honoured, then with
 * SSSOM 1.0 as the assumed version, where every pipe is expected to act as a
 * separator and backslashes are expected to be left untouched.
 */
@Test
void testEscapedPipe() throws IOException, SSSOMFormatException {
TSVReader reader = new TSVReader("src/test/resources/sets/test-escaping-pipe.sssom.tsv");
reader.setAssumedVersion(Version.SSSOM_1_1);
MappingSet ms = reader.read();
Mapping m = ms.getMappings().get(0);

// First record: the escaped pipe stays inside the first value, so only the
// second (unescaped) pipe separates values.
Assertions.assertEquals("Alice|Bob", m.getAuthorLabel().get(0));
Assertions.assertEquals("Charlie", m.getAuthorLabel().get(1));

// Second record: a lone backslash is kept as is, a doubled backslash before a
// separator yields one backslash, a doubled backslash followed by an escaped
// pipe yields a backslash and a pipe, and a backslash ending the cell is kept
// as is.
m = ms.getMappings().get(1);
Assertions.assertEquals(3, m.getAuthorLabel().size());
Assertions.assertEquals("Alice\\Bob", m.getAuthorLabel().get(0));
Assertions.assertEquals("Charlie\\", m.getAuthorLabel().get(1));
Assertions.assertEquals("David\\|Eve\\", m.getAuthorLabel().get(2));

// Try again in SSSOM 1.0 compliance mode; there should be no escaping
// (a fresh reader is created so that the file is parsed again from the start)
reader = new TSVReader("src/test/resources/sets/test-escaping-pipe.sssom.tsv");
reader.setAssumedVersion(Version.SSSOM_1_0);
m = reader.read().getMappings().get(0);

// The backslash remains part of the first value and both pipes split, giving
// three values instead of two.
Assertions.assertEquals("Alice\\", m.getAuthorLabel().get(0));
Assertions.assertEquals("Bob", m.getAuthorLabel().get(1));
Assertions.assertEquals("Charlie", m.getAuthorLabel().get(2));
}

/*
Expand Down
19 changes: 19 additions & 0 deletions core/src/test/java/org/incenp/obofoundry/sssom/TSVWriterTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -333,10 +333,29 @@ void testEscapingTSV() throws IOException, SSSOMFormatException {
ms.getMappings().get(0).setComment("Value\twith\ttab\tcharacters");
ms.getMappings().get(0).setObjectLabel("Value with \"quote\" characters");
ms.getMappings().get(0).setIssueTrackerItem("Value with\nnew line character");
ms.getMappings().get(0).getAuthorLabel(true).add("Alice");
ms.getMappings().get(0).getAuthorLabel().add("Bob\tand\tCharlie");

assertWrittenAsExpected(ms, "test-escaping-tsv", null, null, null);
}

/*
 * Test that pipe and backslash characters inside the values of a multi-valued
 * slot are escaped when a mapping set is written.
 *
 * NOTE(review): assertWrittenAsExpected is presumably comparing the written
 * output against the expected "test-escaping-pipe" file; its exact behaviour
 * is not visible from this method.
 */
@Test
void testEscapingPipeCharacter() throws IOException, SSSOMFormatException {
MappingSet ms = getTestSet();
ms.setMappingSetId("https://example.org/sets/test-escaping-pipe");
// First mapping: one value containing a literal pipe, followed by a plain value.
// NOTE(review): getAuthorLabel(true) presumably creates the list if it is
// missing, so that the subsequent getAuthorLabel() call can use it; confirm
// against the Mapping class.
ms.getMappings().get(0).getAuthorLabel(true).add("Alice|Bob");
ms.getMappings().get(0).getAuthorLabel().add("Charlie");

// Second mapping: built from the first one with another subject and a fresh,
// empty list of author labels, then filled with values exercising backslashes:
// one in the middle of a value, one ending a non-final value, and one
// preceding a pipe together with one ending the final value.
Mapping m2 = ms.getMappings().get(0).toBuilder().subjectId("https://example.org/entities/0002")
.authorLabel(new ArrayList<>()).build();
m2.getAuthorLabel().add("Alice\\Bob");
m2.getAuthorLabel().add("Charlie\\");
m2.getAuthorLabel().add("David\\|Eve\\");
ms.getMappings().add(m2);

assertWrittenAsExpected(ms, "test-escaping-pipe", null, null, null);
}

@Test
void testWritingEnumValuesInYAML() throws IOException, SSSOMFormatException {
MappingSet ms = getTestSet();
Expand Down
8 changes: 8 additions & 0 deletions core/src/test/resources/sets/test-escaping-pipe.sssom.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#curie_map:
# COMENT: https://example.com/entities/
# ORGENT: https://example.org/entities/
#mapping_set_id: https://example.org/sets/test-escaping-pipe
#license: https://creativecommons.org/licenses/by/4.0/
subject_id subject_label predicate_id object_id object_label mapping_justification author_label
ORGENT:0001 alice skos:closeMatch COMENT:0011 alpha semapv:ManualMappingCuration Alice\|Bob|Charlie
ORGENT:0002 alice skos:closeMatch COMENT:0011 alpha semapv:ManualMappingCuration Alice\Bob|Charlie\\|David\\\|Eve\
4 changes: 2 additions & 2 deletions core/src/test/resources/sets/test-escaping-tsv.sssom.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@
# ORGENT: https://example.org/entities/
#mapping_set_id: https://example.org/sets/test-escaping-tsv
#license: https://creativecommons.org/licenses/by/4.0/
subject_id subject_label predicate_id object_id object_label mapping_justification issue_tracker_item comment
ORGENT:0001 Value with , characters skos:closeMatch COMENT:0011 "Value with ""quote"" characters" semapv:ManualMappingCuration "Value with
subject_id subject_label predicate_id object_id object_label mapping_justification author_label issue_tracker_item comment
ORGENT:0001 Value with , characters skos:closeMatch COMENT:0011 "Value with ""quote"" characters" semapv:ManualMappingCuration "Alice|Bob and Charlie" "Value with
new line character" "Value with tab characters"