Added unescaping of XML entities and char refs
This commit is contained in:
parent
f752a883f1
commit
459474323c
1 changed files with 59 additions and 0 deletions
|
@ -1,7 +1,9 @@
|
||||||
package net.vhati.modmanager.core;
|
package net.vhati.modmanager.core;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
import org.xml.sax.SAXParseException;
|
import org.xml.sax.SAXParseException;
|
||||||
|
@ -31,6 +33,8 @@ import org.jdom2.input.JDOMParseException;
|
||||||
* An attribute name can start right after the quote from a prior value.
|
* An attribute name can start right after the quote from a prior value.
|
||||||
* Namespace prefixes for nodes and attributes are unique.
|
* Namespace prefixes for nodes and attributes are unique.
|
||||||
* (Each prefix will be used as the namespace's URI).
|
* (Each prefix will be used as the namespace's URI).
|
||||||
|
* Unrecognized named entities (&...;) and lone ampersands are accepted
|
||||||
|
* as literal text. (Those ampersands will be escaped if outputted).
|
||||||
*
|
*
|
||||||
* Only use this as a last resort, after a real parser fails.
|
* Only use this as a last resort, after a real parser fails.
|
||||||
*/
|
*/
|
||||||
|
@ -45,8 +49,10 @@ public class SloppyXMLParser {
|
||||||
private Pattern strayCharsPtn = Pattern.compile( "(\\s*)(?:-->|[-.>,])" );
|
private Pattern strayCharsPtn = Pattern.compile( "(\\s*)(?:-->|[-.>,])" );
|
||||||
|
|
||||||
private Pattern attrPtn = Pattern.compile( "\\s*(?:([\\w.-]+):)?([\\w.-]+)\\s*=\\s*(\"[^\"]*\"|'[^']*')" );
|
private Pattern attrPtn = Pattern.compile( "\\s*(?:([\\w.-]+):)?([\\w.-]+)\\s*=\\s*(\"[^\"]*\"|'[^']*')" );
|
||||||
|
private Pattern entityPtn = Pattern.compile( "&(?:(?:#([0-9]+))|(?:#x([0-9A-Fa-f]+))|([^;]+));" );
|
||||||
|
|
||||||
private List<Pattern> chunkPtns = new ArrayList<Pattern>();
|
private List<Pattern> chunkPtns = new ArrayList<Pattern>();
|
||||||
|
private Map<String,String> entityMap = new HashMap<String,String>();
|
||||||
|
|
||||||
private JDOMFactory factory;
|
private JDOMFactory factory;
|
||||||
|
|
||||||
|
@ -66,6 +72,12 @@ public class SloppyXMLParser {
|
||||||
chunkPtns.add( eTagPtn );
|
chunkPtns.add( eTagPtn );
|
||||||
chunkPtns.add( endSpacePtn );
|
chunkPtns.add( endSpacePtn );
|
||||||
chunkPtns.add( strayCharsPtn );
|
chunkPtns.add( strayCharsPtn );
|
||||||
|
|
||||||
|
entityMap.put( "lt", "<" );
|
||||||
|
entityMap.put( "gt", ">" );
|
||||||
|
entityMap.put( "amp", "&" );
|
||||||
|
entityMap.put( "apos", "'" );
|
||||||
|
entityMap.put( "quot", "\"" );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -135,6 +147,7 @@ public class SloppyXMLParser {
|
||||||
String attrName = am.group( 2 );
|
String attrName = am.group( 2 );
|
||||||
String attrValue = am.group( 3 );
|
String attrValue = am.group( 3 );
|
||||||
attrValue = attrValue.substring( 1, attrValue.length()-1 );
|
attrValue = attrValue.substring( 1, attrValue.length()-1 );
|
||||||
|
attrValue = unescape( attrValue );
|
||||||
|
|
||||||
if ( attrPrefix != null ) {
|
if ( attrPrefix != null ) {
|
||||||
if ( attrPrefix.equals( "xmlns" ) ) {
|
if ( attrPrefix.equals( "xmlns" ) ) {
|
||||||
|
@ -175,6 +188,7 @@ public class SloppyXMLParser {
|
||||||
}
|
}
|
||||||
else if ( chunkPtn == eTagPtn ) {
|
else if ( chunkPtn == eTagPtn ) {
|
||||||
String interimText = m.group( 1 );
|
String interimText = m.group( 1 );
|
||||||
|
interimText = unescape( interimText );
|
||||||
factory.addContent( parentNode, factory.text( interimText ) );
|
factory.addContent( parentNode, factory.text( interimText ) );
|
||||||
parentNode = parentNode.getParent();
|
parentNode = parentNode.getParent();
|
||||||
}
|
}
|
||||||
|
@ -219,6 +233,51 @@ public class SloppyXMLParser {
|
||||||
return doc;
|
return doc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Unescapes standard named entities and numeric character references.
|
||||||
|
* This applies to attributes and element values.
|
||||||
|
*
|
||||||
|
* They are: lt, gt, quot, apos, amp, #1234, #x1a2b.
|
||||||
|
*/
|
||||||
|
public String unescape( String s ) {
|
||||||
|
StringBuffer buf = new StringBuffer( s.length() );
|
||||||
|
Matcher m = entityPtn.matcher( s );
|
||||||
|
String decRef;
|
||||||
|
String hexRef;
|
||||||
|
int charCode;
|
||||||
|
String entName;
|
||||||
|
String entity;
|
||||||
|
|
||||||
|
while ( m.find() ) {
|
||||||
|
decRef = m.group( 1 );
|
||||||
|
hexRef = m.group( 2 );
|
||||||
|
entName = m.group( 3 );
|
||||||
|
if ( (decRef != null) ) {
|
||||||
|
// Decimal character reference.
|
||||||
|
charCode = Integer.parseInt( decRef );
|
||||||
|
entity = Character.toString( (char)charCode );
|
||||||
|
}
|
||||||
|
else if ( (hexRef != null) ) {
|
||||||
|
// Hex character reference.
|
||||||
|
charCode = Integer.parseInt( hexRef, 16 );
|
||||||
|
entity = Character.toString( (char)charCode );
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
entity = entityMap.get( entName );
|
||||||
|
if ( entity == null ) {
|
||||||
|
// Unknown entity, repeat it as-is.
|
||||||
|
entity = "&"+ entName +";";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
m.appendReplacement( buf, entity );
|
||||||
|
}
|
||||||
|
m.appendTail( buf );
|
||||||
|
|
||||||
|
return buf.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns lineNum and colNum for a position in text.
|
* Returns lineNum and colNum for a position in text.
|
||||||
*/
|
*/
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue