...
Create the corresponding test harness.
Code Block |
---|
title | CSVTests.java |
---|
|
/** Smoke tests for the generated CSV parser: each test runs the {@code line} rule on a minimal input. */
public class CSVTests {

    /** A lone line feed is parsed as a line. */
    @Test
    public void testNewline() throws IOException, RecognitionException {
        CSVParser lfParser = createParser("\n");
        lfParser.line();
    }

    /** A carriage return followed by a line feed is parsed as a line. */
    @Test
    public void testCRLF() throws IOException, RecognitionException {
        CSVParser crlfParser = createParser("\r\n");
        crlfParser.line();
    }

    /**
     * Builds the lexer / token stream / parser chain over an in-memory string.
     *
     * @param testString the text to parse
     * @return a parser positioned at the start of {@code testString}
     */
    private CSVParser createParser(String testString) throws IOException {
        CharStream input = new ANTLRStringStream(testString);
        CommonTokenStream tokenStream = new CommonTokenStream(new CSVLexer(input));
        return new CSVParser(tokenStream);
    }
}
|
Generate the Java files for this grammar, then run the test.
...
Code Block |
---|
title | CSVTests.java fragment |
---|
|
@Test
public void testNewline() throws IOException, RecognitionException {
    // A bare line feed is an empty line, so no fields should come back.
    // Note: the generated method is still "final public void line...".
    List<String> fields = createParser("\n").line();
    assert fields.isEmpty() : "Nothing to return";
}
@Test
public void testCRLF() throws IOException, RecognitionException {
    // A CR LF pair is also an empty line, so no fields should come back.
    List<String> fields = createParser("\r\n").line();
    assert fields.isEmpty() : "Nothing to return";
}
|
As expected, this fails to compile: line
is declared as final public void line, so it returns nothing that could be assigned to result.
...
Code Block |
---|
|
public final List<String> line() throws RecognitionException {
List<String> result = null;
|
...
Code Block |
---|
|
grammar CSV;
// Parses one line: a single (possibly absent) field followed by NEWLINE.
// The result list is created in @init; no action in this rule adds to it,
// so it is returned empty.
line returns [List<String> result]
@init {
result = new ArrayList<String>();
}
: field NEWLINE;
// A field is either one FIELD token, whose text becomes parsedItem, or nothing.
field returns [String parsedItem]
: f=FIELD { $parsedItem = $f.text;}
| // nothing
;
// Line terminator: a line feed, optionally preceded by a carriage return.
NEWLINE : '\r'? '\n';
// NOTE(review): '*' lets this rule match zero characters - confirm that is intended.
FIELD : NONBREAKING* ;
// Any single character other than CR or LF.
NONBREAKING
: ~('\r' | '\n');
|
Info |
---|
title | What's that funny character? |
---|
|
"~" means "not" and is used to match any item that's not in a set. See The Definitive ANTLR Reference, page 95. |
...
Code Block |
---|
title | CSVTests.java fragment |
---|
|
@Test
public void testMultipleWords() throws IOException, RecognitionException {
    // Four comma-separated fields, the third of which is empty.
    List<String> fields = createParser("Red,Green,,Blue\n").line();
    assert fields.size() == 4: "Expected 4 items";

    String first = fields.get(0);
    assert first.equals("Red") : "Expected Red";
    String second = fields.get(1);
    assert second.equals("Green") : "Expected Green";
    String third = fields.get(2);
    assert third.equals("") : "Expected empty";
    String fourth = fields.get(3);
    assert fourth.equals("Blue") : "Expected Blue";
}
|
...
Code Block |
---|
|
grammar CSV;
// Parses one line and returns its field values in order.
// A bare NEWLINE takes the first alternative, which adds nothing to result;
// otherwise every comma-separated field value is appended before the NEWLINE.
line returns [List<String> result]
@init {
result = new ArrayList<String>();
}
// Syntactic predicate: take this alternative when the next token is NEWLINE.
: (NEWLINE) => NEWLINE
| (
fieldResult=field { result.add(fieldResult); }
( COMMA fieldResult=field {result.add(fieldResult);} )*
NEWLINE
)
;
// A field is one FIELD token or nothing. parsedItem starts as "" in @init,
// so the empty alternative yields an empty string.
field returns [String parsedItem]
@init {
parsedItem = "";
}
: f=FIELD {$parsedItem=$f.text;}
| // nothing
;
// Line terminator: a line feed, optionally preceded by a carriage return.
NEWLINE : '\r'? '\n';
COMMA : ',';
// One or more NONBREAKING characters.
FIELD: NONBREAKING+;
// Any single character other than CR, LF, or a comma.
fragment NONBREAKING
: ~('\r' | '\n' | ',');
|
This works, but line
is getting cluttered.
...
Code Block |
---|
|
grammar CSV;
/* Old definitions commented out:
line returns [List<String> result]
@init { result = new ArrayList<String>(); }
: (NEWLINE) => NEWLINE
| (
fieldResult=field { result.add(fieldResult); }
( COMMA fieldResult=field {result.add(fieldResult);} )*
NEWLINE
)
;
field returns [String parsedItem]
@init { parsedItem = ""; }
: (f=FIELD {$parsedItem=$f.text;}
| // nothing
)
{ fields.add($parsedItem); }
;
*/
// New definitions:

// Parses one line and returns its field values in order.
// The values are collected in a rule-scoped list that the field rule appends
// to; a bare NEWLINE leaves that list empty.
line returns [List<String> result]
scope { List fields; }
@init { $line::fields = new ArrayList(); }
    : (
        (NEWLINE) => NEWLINE
      | field (COMMA field)* NEWLINE
      )
      { $result = $line::fields; }
    ;

// A field is one FIELD token or nothing; either way exactly one entry is
// appended to the enclosing line's list.
field
    : ( f=FIELD
      | // nothing
      )
      // The empty alternative matches no token and leaves f null; record the
      // field as "" so callers never see a null entry.
      { $line::fields.add(($f == null) ? "" : $f.text); }
    ;

// Line terminator: a line feed, optionally preceded by a carriage return.
NEWLINE : '\r'? '\n';
COMMA : ',';
// One or more NONBREAKING characters.
FIELD: NONBREAKING+;
// Any single character other than CR, LF, or a comma.
fragment NONBREAKING
    : ~('\r' | '\n' | ',');
|
Since field
no longer returns a string, we'll need to alter the test to pass the value through line
and add a newline to the end of the line:
...
Code Block |
---|
|
grammar CSV;
// Parses one line and returns its field values in order.
// The values are collected in a rule-scoped list that the field rule appends
// to; a bare NEWLINE leaves that list empty.
line returns [List<String> result]
scope { List fields; }
@init { $line::fields = new ArrayList(); }
: (
(NEWLINE) => NEWLINE
| field (COMMA field)* NEWLINE
)
{ $result = $line::fields; }
;
// A field is a QUOTED token, an UNQUOTED token, or nothing.
// When no token matched, f is null and the field is recorded as "".
field
: ( f=QUOTED
| f=UNQUOTED
| // nothing
)
{ $line::fields.add(($f == null) ? "" : $f.text); }
;
// Line terminator: a line feed, optionally preceded by a carriage return.
NEWLINE : '\r'? '\n';
COMMA : ',';
// Double-quoted text. The non-greedy loop stops at the first closing quote.
QUOTED : '"' ( options {greedy=false;} : . )* '"'
{
// Strip the surrounding quotes
String txt = getText();
setText(txt.substring(1, txt.length() -1));
};
// One or more characters other than CR, LF, comma, space, or double quote.
// NOTE(review): no rule in this grammar matches a space outside quotes - confirm how spaces are meant to be handled.
UNQUOTED : ~('\r' | '\n' | ',' | ' ' | '"')+;
|
This gets the job done.
Quoting, part 2
...
Code Block |
---|
|
// Double-quoted field. Inside the quotes, a doubled quote stands for one
// literal quote and any other character (line breaks included) is taken
// as-is. Unlike a chain of non-empty quoted segments, this form also accepts
// an empty field and a field that starts or ends with an escaped quote.
QUOTED : '"' ( '""' | ~'"' )* '"'
{
    // Drop the opening and closing quote.
    String raw = getText();
    String inner = raw.substring(1, raw.length() - 1);
    // Unescape left to right, so that each doubled quote becomes exactly one
    // quote even when several escaped quotes are adjacent.
    setText(inner.replace("\"\"", "\""));
};
|
Remove spaces around commas
...
Code Block |
---|
|
grammar CSV;

@lexer::members {
    // Every error the lexer reports is also recorded here, so callers can
    // inspect failures after lexing.
    List<RecognitionException> exceptions = new ArrayList<RecognitionException>();

    public List<RecognitionException> getExceptions() {
        return exceptions;
    }

    @Override
    public void reportError(RecognitionException e) {
        super.reportError(e);
        exceptions.add(e);
    }
}

// Parses one line and returns its field values in order.
// The values are collected in a rule-scoped list that the field rule appends
// to; a bare NEWLINE leaves that list empty.
line returns [List<String> result]
scope { List fields; }
@init { $line::fields = new ArrayList(); }
    : (
        (NEWLINE) => NEWLINE
      | field (COMMA field)* NEWLINE
      )
      { $result = $line::fields; }
    ;

// A field is a QUOTED token, an UNQUOTED token, or nothing.
// When no token matched, f is null and the field is recorded as "".
field
    : ( f=QUOTED
      | f=UNQUOTED
      | // nothing
      )
      { $line::fields.add(($f == null) ? "" : $f.text); }
    ;

// Line terminator: a line feed, optionally preceded by a carriage return.
NEWLINE : '\r'? '\n';
COMMA : ',';

// Double-quoted field. Inside the quotes, a doubled quote stands for one
// literal quote and any other character (line breaks included) is taken
// as-is. Unlike a chain of non-empty quoted segments, this form also accepts
// an empty field and a field that starts or ends with an escaped quote.
QUOTED : '"' ( '""' | ~'"' )* '"'
{
    // Drop the opening and closing quote.
    String raw = getText();
    String inner = raw.substring(1, raw.length() - 1);
    // Unescape left to right, so that each doubled quote becomes exactly one
    // quote even when several escaped quotes are adjacent.
    setText(inner.replace("\"\"", "\""));
};

// One or more characters other than CR, LF, comma, or space.
// NOTE(review): a double quote is not excluded here, so this rule overlaps
// with QUOTED on input that starts with a quote - confirm that is intended.
UNQUOTED
    : ~('\r' | '\n' | ',' | ' ')+;
|
Regenerate the lexer and parser, then run the tests.
...