From 30a2b05483cc2ca1dc7b3739d98107157b4fd0e6 Mon Sep 17 00:00:00 2001 From: Vhati Date: Sat, 7 Sep 2013 00:22:40 -0400 Subject: [PATCH] The sloppy parser reports line/col to the JDOMFactory for element start tags --- .../modmanager/core/SloppyXMLParser.java | 93 ++++++++++++++++--- 1 file changed, 82 insertions(+), 11 deletions(-) diff --git a/src/main/java/net/vhati/modmanager/core/SloppyXMLParser.java b/src/main/java/net/vhati/modmanager/core/SloppyXMLParser.java index de09f34..2d32f2e 100644 --- a/src/main/java/net/vhati/modmanager/core/SloppyXMLParser.java +++ b/src/main/java/net/vhati/modmanager/core/SloppyXMLParser.java @@ -38,7 +38,21 @@ import org.jdom2.input.JDOMParseException; * Unrecognized named entities (&...;) and lone ampersands are accepted * as literal text. (Those ampersands will be escaped if outputted). * + * The text must have \n line endings. + * + * If a line/column aware JDOMFactory is passed to the constructor, + * that factory will receive locations for Elements (start tags). + * That will be the 1-based line/col of the end character, + * plus 1 col. + * + * If parsing fails, the thrown JDOMParseException has getter methods + * to report the nearest upcoming non-whitespace character, from where + * the parser gave up. + * * Only use this as a last resort, after a real parser fails. + * + * @see org.jdom2.input.JDOMParseException + * @see org.jdom2.located.LocatedJDOMFactory */ public class SloppyXMLParser { @@ -53,6 +67,8 @@ public class SloppyXMLParser { private Pattern attrPtn = Pattern.compile( "\\s*(?:([\\w.-]+):)?([\\w.-]+)\\s*=\\s*(\"[^\"]*\"|'[^']*')" ); private Pattern entityPtn = Pattern.compile( "&(?:(?:#([0-9]+))|(?:#x([0-9A-Fa-f]+))|([^;]+));" ); + private Pattern breakPtn = Pattern.compile( "\n" ); + private List chunkPtns = new ArrayList(); private Map entityMap = new HashMap(); @@ -91,6 +107,7 @@ public class SloppyXMLParser { int sLen = s.length(); int lastPos = -1; int pos = 0; + int[] lastLineAndCol = new int[] {0, 0}; // Counts \n's and chars after the last \n. String tmp = null; Matcher m = declPtn.matcher( s ); @@ -104,6 +121,7 @@ public class SloppyXMLParser { if ( chunkPtn == declPtn ) { // Don't care. + addLineAndCol( lastLineAndCol, m.group(0) ); } else if ( chunkPtn == commentPtn ) { String whitespace = m.group( 1 ); @@ -138,6 +156,8 @@ public class SloppyXMLParser { factory.addContent( parentNode, commentNode ); } } + + addLineAndCol( lastLineAndCol, s, m.start(), m.end() ); } else if ( chunkPtn == cdataPtn ) { String whitespace = m.group( 1 ); @@ -146,6 +166,8 @@ public class SloppyXMLParser { CDATA cdataNode = factory.cdata( m.group(2) ); factory.addContent( parentNode, cdataNode ); + + addLineAndCol( lastLineAndCol, s, m.start(), m.end() ); } else if ( chunkPtn == sTagPtn ) { String whitespace = m.group( 1 ); @@ -157,13 +179,15 @@ public class SloppyXMLParser { String attrString = m.group( 4 ); boolean selfClosing = ( m.group( 5 ).length() > 0 ); + addLineAndCol( lastLineAndCol, s, m.start(), m.end() ); + Element tagNode; if ( nodePrefix != null ) { Namespace nodeNS = Namespace.getNamespace( nodePrefix, nodePrefix ); // URI? *shrug* factory.addNamespaceDeclaration( rootNode, nodeNS ); - tagNode = factory.element( nodeName, nodeNS ); + tagNode = factory.element( lastLineAndCol[0]+1, lastLineAndCol[1]+1+1, nodeName, nodeNS ); } else { - tagNode = factory.element( nodeName ); + tagNode = factory.element( lastLineAndCol[0]+1, lastLineAndCol[1]+1+1, nodeName ); } if ( attrString.length() > 0 ) { @@ -218,8 +242,11 @@ public class SloppyXMLParser { else if ( chunkPtn == eTagPtn ) { String interimText = m.group( 1 ); interimText = unescape( interimText ); + factory.addContent( parentNode, factory.text( interimText ) ); parentNode = parentNode.getParent(); + + addLineAndCol( lastLineAndCol, s, m.start(), m.end() ); } else if ( chunkPtn == endSpacePtn ) { // This is the end of the document. @@ -230,6 +257,8 @@ public class SloppyXMLParser { String whitespace = m.group( 1 ); if ( whitespace.length() > 0 ) factory.addContent( parentNode, factory.text( whitespace ) ); + + addLineAndCol( lastLineAndCol, s, m.start(), m.end() ); } matchedChunk = true; @@ -325,24 +354,66 @@ public class SloppyXMLParser { /** - * Returns lineNum and colNum for a position in text. + * Increments an ongoing tally of lines and the col on the current line. + * + * @param lastLineAndCol the current tally to increment (0-based) + * @param s a string to check for \n's + * @param start a start index in the string to search from (inclusive) + * @param start an end index in the string (exclusive) */ - public int[] getLineAndCol( CharSequence s, int pos ) { - Matcher breakMatcher = Pattern.compile( "\n" ).matcher( s ); - breakMatcher.region( 0, pos+1 ); + private void addLineAndCol( int[] lastLineAndCol, CharSequence s, int start, int end ) { + if ( s.length() == 0 || start == end ) return; + + Matcher breakMatcher = breakPtn.matcher( s ); + breakMatcher.region( start, end ); + int breakCount = 0; int lastBreakPos = -1; - int lineNum = 1; while ( breakMatcher.find() ) { lastBreakPos = breakMatcher.start(); - breakMatcher.region( breakMatcher.end(), breakMatcher.regionEnd() ); - lineNum++; + breakCount++; + } + if ( lastBreakPos == -1 ) { + // Same line, a few more chars in. Increment col. + lastLineAndCol[1] += end-1 - start; + } else { + // On a new line now, reset the col. + lastLineAndCol[0] += breakCount; + lastLineAndCol[1] = end-1 - lastBreakPos; + } + } + + private void addLineAndCol( int[] lastLineAndCol, CharSequence s ) { + addLineAndCol( lastLineAndCol, s, 0, s.length() ); + } + + + /** + * Returns lineNum and colNum for a position in text. + * The first line is line 1. + * Line breaks start a new lins as col 0. + * The first char of each line, after the break is col 1. + * + * @param pos a 0-based offset + * @return 1-based ints for line and col (the first char is line 1, col 1) + * @see org.jdom2.input.JDOMParseException + */ + public int[] getLineAndCol( CharSequence s, int pos ) { + pos = Math.min( pos, s.length() ); + + Matcher breakMatcher = breakPtn.matcher( s ); + breakMatcher.region( 0, pos+1 ); // Include pos itself in case it's a break. + int breakCount = 0; + int lastBreakPos = -1; + while ( breakMatcher.find() ) { + lastBreakPos = breakMatcher.start(); + breakCount++; } int colNum; if ( lastBreakPos == -1 ) - colNum = pos+1; + colNum = pos+1; // Pretend ^ was column 0, as a \n would. else colNum = pos - lastBreakPos; - return new int[] { lineNum, colNum }; + return new int[] { breakCount+1, colNum }; } }