Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.
Comment: More notes on numeric validation

...

Note
titleError handling

This version now checks that numbers are in the correct format (essentially, no leading zeroes) and disables error handling so the exceptions propagate all the way up. Thanks to Nuno Job for pointing this out: http://www.nunojob.com/blog/2009/05/26/json-grammar/

You don't have to include the numeric validation code or the "disable error handling" code if you don't want to. Thanks to Nuno Job for pointing this out: http://www.nunojob.com/blog/2009/05/26/json-grammar/Another option would be to move the numeric validity check into the tree parser (extractNumber), which would have the advantage of leaving the parser and lexer language-independent (and, truth be told, would probably be cleaner.)

Here's the front end:

Code Block
titleJSON.g
grammar JSON;

options {
	output = AST;
}

tokens {
	STRING; NUMBER; OBJECT; FIELD; ARRAY;
	COMMA = ',';
	TRUE; FALSE; NULL;
}

@header {
package net.nextquestion.json;

import java.util.regex.Pattern;

}

@lexer::header {
package net.nextquestion.json;
}

// Optional step: Disable automatic error recovery
@members {
protected void mismatch(IntStream input, int ttype, BitSet follow)
throws RecognitionException
{
throw new MismatchedTokenException(ttype, input);
}
public Object recoverFromMismatchedSet(IntStream input,
RecognitionException e,
BitSet follow)
throws RecognitionException
{
throw e;
}
}
// Alter code generation so catch-clauses get replace with
// this action.
@rulecatch {
catch (RecognitionException e) {
throw e;
}
} 
value

	: string
	| number
	| object
	| array
	| 'true' -> TRUE
	| 'false' -> FALSE
	| 'null' -> NULL
	;

string 	: String
	  -> ^(STRING String)
	;

// If you want to conform to the RFC, use a validating semantic predicate to check the result.
// You can omit the check if you want. The parser will still recognize valid JSON and it will still recognize valid JSON and it will
// allow numbers with leading zeroes
// allow numbers with leading zeroes.
// See the second note above for an alternate approach using the tree parser.
// This could be more efficient (e.g. pre-compile the pattern), but I'm going for clarity here.
number	: n=Number {Pattern.matches("(0|(-?[1-9]\\d*))(\\.\\d+)?", n.getText())}?
	    Exponent?
	  -> ^(NUMBER Number Exponent?)
	;

object	: '{' members '}'
	  -> ^(OBJECT members)
	;

array	: '[' elements ']'
	  -> ^(ARRAY elements)
	;

elements: value (COMMA! value)*
	;

members	: pair (COMMA! pair)*
    ;

pair	: String ':' value
	  -> ^(FIELD String value)
	;
// Simple, but more permissive than the RFC allows. See number above for a validity check.
Number	: '-'? Digit+ ( '.' Digit+)?;

Exponent: ('e'|'E') '-'? Digit+;

String 	:
	'"' ( EscapeSequence | ~('\u0000'..'\u001f' | '\\' | '\"' ) )* '"'
	;

WS: (' '|'\n'|'\r'|'\t')+ {$channel=HIDDEN;} ; // ignore whitespace

fragment EscapeSequence
    	:   '\\' (UnicodeEscape |'b'|'t'|'n'|'f'|'r'|'\"'|'\''|'\\')
    	;

fragment UnicodeEscape
	: 'u' HexDigit HexDigit HexDigit HexDigit
	;

fragment HexDigit
	: '0'..'9' | 'A'..'F' | 'a'..'f'
	;

fragment Digit
	: '0'..'9'
	;


...

Code Block
titleJsonTree.g
tree grammar JSONTree;

options {
tokenVocab=JSON; // reuse token types
ASTLabelType=CommonTree; // $label will have type CommonTree
}

@header {
package net.nextquestion.json;

import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.util.HashMap;
import java.io.ByteArrayOutputStream;
import java.io.OutputStreamWriter;

}

@members {
    private Object extractNumber(CommonTree numberToken, CommonTree exponentToken) {
        String numberBody = numberToken.getText();
        // you could choose to validate numberBody here instead of in the parser
        String exponent = (exponentToken == null) ? null : exponentToken.getText().substring(1); // remove the 'e' prefix if there
        boolean isReal = numberBody.indexOf('.') >= 0 || exponent != null;
        if (!isReal) {
            return new Integer(numberBody);
        } else {
            double result = Double.parseDouble(numberBody);
            if (exponent != null) {
                result = result * Math.pow(10.0f, Double.parseDouble(exponent));
            }
            return new Double(result);
        }
    }

    private String extractString(CommonTree token) {
        // StringBuffers are an efficient way to modify strings
        StringBuffer sb = new StringBuffer(token.getText());
        // Process character escapes
        int startPoint = 1; // skip initial quotation mark
        for (;;) {
            int slashIndex = sb.indexOf("\\", startPoint); // search for a single backslash
            if (slashIndex == -1) break;
            // Else, we have a backslash
            char escapeType = sb.charAt(slashIndex + 1);
            switch (escapeType) {
                case'u':
                    // Unicode escape.
                    String unicode = extractUnicode(sb, slashIndex);
                    sb.replace(slashIndex, slashIndex + 6, unicode); // backspace
                    break; // back to the loop

                    // note: Java's character escapes match JSON's, which is why it looks like we're replacing
                // "\b" with "\b". We're actually replacing 2 characters (slash-b) with one (backspace).
                case 'b':
                    sb.replace(slashIndex, slashIndex + 2, "\b"); // backspace
                    break;

                case 't':
                    sb.replace(slashIndex, slashIndex + 2, "\t"); // tab
                    break;

                case 'n':
                    sb.replace(slashIndex, slashIndex + 2, "\n"); // newline
                    break;

                case 'f':
                    sb.replace(slashIndex, slashIndex + 2, "\f"); // form feed
                    break;

                case 'r':
                    sb.replace(slashIndex, slashIndex + 2, "\r"); // return
                    break;

                case '\'':
                    sb.replace(slashIndex, slashIndex + 2, "\'"); // single quote
                    break;

                case '\"':
                    sb.replace(slashIndex, slashIndex + 2, "\""); // double quote
                    break;

                case '\':
                    sb.replace(slashIndex, slashIndex + 2, "\\"); // backslash
                    break;



                case '/':
                    sb.replace(slashIndex, slashIndex + 2, "/"); // solidus
                    break;

            }
            startPoint = slashIndex+1;

        }

        // remove surrounding quotes
        sb.deleteCharAt(0);
        sb.deleteCharAt(sb.length() - 1);

        return sb.toString();
    }

    private String extractUnicode(StringBuffer sb, int slashIndex) {
        // Gather the 4 hex digits, convert to an integer, translate the number to a unicode char, replace
        String result;
        String code = sb.substring(slashIndex + 2, slashIndex + 6);
        int charNum = Integer.parseInt(code, 16); // hex to integer
        // There's no simple way to go from an int to a unicode character.
        // We'll have to pass this through an output stream writer to do
        // the conversion.
        try {
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            OutputStreamWriter osw = new OutputStreamWriter(baos, "UTF-8");
            osw.write(charNum);
            osw.flush();
            result = baos.toString("UTF-8"); // Thanks to Silvester Pozarnik for the tip about adding "UTF-8"
        } catch (Exception e) {
            e.printStackTrace();
            result = null;
        }
        return result;
    }

}

value returns [Object result]
	: s=string { $result = s; }
	| n=number { $result = n; }
	| o=object { $result = o; }
	| a=array { $result = a; }
	| TRUE { $result=Boolean.TRUE; }
	| FALSE {$result = Boolean.FALSE; }
	| NULL {$result = null; }
	;

string returns [String result]
	: ^(STRING String)
	  { $result = extractString($String); }
	;

object returns [Map result]
@init { result = new HashMap(); }
	: ^(OBJECT pair[$result]+)
	;

number	returns [Object result]
	: ^(NUMBER Number Exponent?)
	  { $result = extractNumber($Number, $Exponent); }
	;

array	returns [List list]
@init{ list = new ArrayList(); }
	: ^(ARRAY (v=value {$list.add(v); })+ )
	;

pair [Map map]
	: ^(FIELD key=String v=value)
	   { $map.put(extractString($key), v); }
	;

...