Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.
Comment: ooops - some stuff got left off.

...

No Format
/**
 * Expands unicode escapes of the form {@code \\uXXXX} (a backslash, a lower-case
 * {@code u} and exactly four hexadecimal digits, upper or lower case) into the
 * characters they denote and returns the result. This is similar to the escape
 * handling applied to property files, except that white space escapes are not
 * processed (they are consumed by the template library).
 * <p>
 * Malformed or truncated escapes are copied to the output unchanged. Every
 * well-formed escape is expanded, including {@code \\u0000}. A backslash that
 * precedes the escape is not treated specially, so a doubled backslash followed
 * by {@code uXXXX} still has its escape expanded.
 * <p>
 * Files supplied in another encoding can be prepared for this method with the
 * native2ascii tool (a Java SDK binary), which rewrites characters that cannot be
 * represented in the output encoding as {@code \\uXXXX} escapes.
 *
 * @param source
 *      string that may contain unicode escapes; must not be {@code null}
 * @return
 *      the string with every well-formed unicode escape replaced by its character
 *
 * @author Caleb Lyness
 */
private String unescapeUnicode(String source) {

    final int srcLen = source.length();

    // The result is never longer than the input, so srcLen is a safe capacity.
    final StringBuilder buffer = new StringBuilder(srcLen);

    int i = 0;
    while (i < srcLen) {
        final char c = source.charAt(i);

        // A complete escape occupies six characters: '\\', 'u' and four hex
        // digits, so the last digit sits at index i + 5.
        if (c == '\\' && i + 5 < srcLen && source.charAt(i + 1) == 'u') {
            final int codeUnit = parseFourHexDigits(source, i + 2);
            if (codeUnit >= 0) {
                buffer.append((char) codeUnit);
                i += 6; // skip the whole escape sequence
                continue;
            }
        }

        // Ordinary character, or the start of a bogus escape: copy it as is.
        buffer.append(c);
        i++;
    }
    return buffer.toString();
}

/**
 * Parses the four characters starting at {@code start} as a hexadecimal number.
 * Only the ASCII digits {@code 0-9}, {@code a-f} and {@code A-F} are accepted.
 * The caller must guarantee that {@code start + 3} is a valid index.
 *
 * @param text
 *      string to read the digits from
 * @param start
 *      index of the first (most significant) digit
 * @return
 *      the parsed value (0 to 0xFFFF), or -1 if any character is not a hex digit
 */
private static int parseFourHexDigits(String text, int start) {
    int value = 0;
    for (int k = start; k < start + 4; k++) {
        final char ch = text.charAt(k);
        final int digit;
        if (ch >= '0' && ch <= '9') {
            digit = ch - '0';
        } else if (ch >= 'a' && ch <= 'f') {
            digit = ch - 'a' + 10;
        } else if (ch >= 'A' && ch <= 'F') {
            digit = ch - 'A' + 10;
        } else {
            // Failure is reported out of band so that a value of zero stays valid.
            return -1;
        }
        value = (value << 4) | digit; // shifting left by 4 multiplies by 16
    }
    return value;
}

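Note: for better performance, avoid calling source.charAt inside the loop (it performs a bounds check on the provided index on every call); instead, copy the string into a char[] once with source.toCharArray() and index into that array.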