/*
 * Ignore this package.
 * It's for Slipstream/GMM catalog maintenance.
 */

package net.vhati.modmanager.scraper;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.List;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;

import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.JDOMException;
import org.jdom2.input.SAXBuilder;
import org.jdom2.output.Format;
import org.jdom2.output.XMLOutputter;

import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.ParseException;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import net.vhati.ftldat.PackUtilities;
import net.vhati.modmanager.core.ModDB;
import net.vhati.modmanager.core.ModInfo;
import net.vhati.modmanager.core.ModsInfo;
import net.vhati.modmanager.json.JacksonCatalogReader;
import net.vhati.modmanager.json.JacksonCatalogWriter;

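/**
 * Command-line tool for maintaining Slipstream's mod catalog by scraping the
 * FTL forum's Master Mod List thread.
 *
 * Illustrative workflow, per the --help text below (invocation details and
 * file names here are hypothetical examples):
 *
 *   ForumScraper --load-json current_catalog.json --scrape changes.xml
 *   ... hand-edit the catalog xml, copy/pasting snippets from changes.xml ...
 *   ForumScraper --load-xml edited_catalog.xml --dump-json catalog.json
 */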
public class ForumScraper {

    private static final Logger log = LoggerFactory.getLogger( ForumScraper.class );

    private static final String MASTER_LIST_URL = "https://subsetgames.com/forum/viewtopic.php?f=11&t=2645";
    private static final String FORUM_URL_FRAGMENT = "https://subsetgames.com/forum/viewtopic.php";


    public static void main( String[] args ) {

        List<String> ignoredURLs = new ArrayList<String>();
        ignoredURLs.add( "https://subsetgames.com/forum/viewtopic.php?f=11&t=11561" );
        ignoredURLs.add( "https://subsetgames.com/forum/viewtopic.php?f=12&t=11083" );
        ignoredURLs.add( "https://subsetgames.com/forum/viewtopic.php?f=4&t=2938" );
        ignoredURLs.add( "http://www.moddb.com/mods/better-planets-and-backgrounds/downloads/better-asteroids" );
        ignoredURLs.add( "https://subsetgames.com/forum/viewtopic.php?f=4&t=2947" );
        ignoredURLs.add( "https://subsetgames.com/forum/viewtopic.php?f=12&t=11604" );
        // Hissatsu's post on "Advanced Battle Systems".
        ignoredURLs.add( "https://subsetgames.com/forum/viewtopic.php?f=11&t=11469&start=60#p55171" );
        // SpaceDock is an app.
        ignoredURLs.add( "https://subsetgames.com/forum/viewtopic.php?f=11&t=16842" );
        // Beginning Scrap Advantage is bundled in GMM.
        ignoredURLs.add( "https://subsetgames.com/forum/viewtopic.php?f=4&t=2464" );
        // Engi Scrap Advantage is bundled in SMM.
        ignoredURLs.add( "https://subsetgames.com/forum/viewtopic.php?f=12&t=17102" );

        BasicParser parser = new BasicParser();

        Options options = new Options();
        options.addOption( OptionBuilder.withLongOpt( "load-json" )
            .withDescription( "load moddb from a json catalog" )
            .hasArg()
            .withArgName( "FILE" )
            .create() );
        options.addOption( OptionBuilder.withLongOpt( "load-xml" )
            .withDescription( "load moddb from an xml file" )
            .hasArg()
            .withArgName( "FILE" )
            .create() );
        options.addOption( OptionBuilder.withLongOpt( "scrape" )
            .withDescription( "write changed forum posts to an xml file" )
            .hasArg()
            .withArgName( "FILE" )
            .create() );
        options.addOption( OptionBuilder.withLongOpt( "dump-json" )
            .withDescription( "write the moddb to a json file" )
            .hasArg()
            .withArgName( "FILE" )
            .create() );
        options.addOption( OptionBuilder.withLongOpt( "dump-xml" )
            .withDescription( "write the moddb to an xml file" )
            .hasArg()
            .withArgName( "FILE" )
            .create() );
        options.addOption( OptionBuilder.withLongOpt( "hash-thread" )
            .withDescription( "print the hash of a specific thread" )
            .hasArg()
            .withArgName( "URL" )
            .create() );
        options.addOption( OptionBuilder.withLongOpt( "first-post" )
            .withDescription( "print the first post of a thread (debugging)" )
            .hasArg()
            .withArgName( "URL" )
            .create() );
        options.addOption( "h", "help", false, "display this help and exit" );

        CommandLine cmdline = null;
        try {
            cmdline = parser.parse( options, args, true );
        }
        catch ( ParseException e ) {
            System.err.println( "Error parsing commandline: "+ e.getMessage() );
            System.exit( 1 );
        }

        if ( cmdline.hasOption( "h" ) ) {
            HelpFormatter formatter = new HelpFormatter();

            String syntax = ForumScraper.class.getCanonicalName() +" [OPTIONS]";

            String helpHeader = "Load an existing catalog as the moddb, and scrape."+ formatter.getNewLine();
            helpHeader += "Edit the catalog by copy/pasting scrape snippets."+ formatter.getNewLine();
            helpHeader += "Load the edited catalog and dump json."+ formatter.getNewLine();

            PrintWriter pw = new PrintWriter( System.out );
            formatter.printUsage( pw, formatter.getWidth(), syntax );
            pw.write( helpHeader );
            pw.write( formatter.getNewLine() );
            formatter.printOptions( pw, formatter.getWidth(), options, formatter.getLeftPadding(), formatter.getDescPadding() );
            pw.flush();

            System.exit( 0 );
        }

        ModDB modDB = new ModDB();

        try {
            if ( cmdline.hasOption( "load-json" ) ) {
                log.info( "Loading json catalog..." );

                File srcFile = new File( cmdline.getOptionValue( "load-json" ) );
                ModDB newDB = JacksonCatalogReader.parse( srcFile );
                if ( newDB != null ) modDB = newDB;
            }

            if ( cmdline.hasOption( "load-xml" ) ) {
                log.info( "Loading xml catalog..." );

                File srcFile = new File( cmdline.getOptionValue( "load-xml" ) );
                ModDB newDB = parseCatalogXML( srcFile );
                if ( newDB != null ) modDB = newDB;
            }

            if ( cmdline.hasOption( "scrape" ) ) {
                log.info( "Scraping..." );

                File dstFile = new File( cmdline.getOptionValue( "scrape" ) );
                List<ModsInfo> data = scrape( modDB, MASTER_LIST_URL, ignoredURLs );
                if ( data.size() > 0 ) writeXML( data, dstFile );
            }

            if ( cmdline.hasOption( "dump-json" ) ) {
                log.info( "Dumping json..." );

                File dstFile = new File( cmdline.getOptionValue( "dump-json" ) );
                List<ModsInfo> data = modDB.getCollatedModInfo();
                if ( data.size() > 0 ) JacksonCatalogWriter.write( data, dstFile );
            }

            if ( cmdline.hasOption( "dump-xml" ) ) {
                log.info( "Dumping xml..." );

                File dstFile = new File( cmdline.getOptionValue( "dump-xml" ) );
                List<ModsInfo> data = modDB.getCollatedModInfo();
                if ( data.size() > 0 ) writeXML( data, dstFile );
            }

            if ( cmdline.hasOption( "hash-thread" ) ) {
                log.info( "Hashing thread..." );

                String threadURL = cmdline.getOptionValue( "hash-thread" );
                System.out.println( hashThread( threadURL ) );
            }

            if ( cmdline.hasOption( "first-post" ) ) {
                log.info( "Getting thread's first post..." );

                String threadURL = cmdline.getOptionValue( "first-post" );
                System.out.println( getFirstPost( threadURL ) );
            }
        }
        catch ( Exception e ) {
            log.error( "An error occurred.", e );
        }
    }


    /**
     * Scrapes the forum for changed posts and returns info from updated mods.
     */
    private static List<ModsInfo> scrape( ModDB knownDB, String masterListURL, List<String> ignoredURLs ) throws IOException, NoSuchAlgorithmException {
        List<ModsInfo> results = new ArrayList<ModsInfo>();

        List<ScrapeResult> scrapeList = scrapeMasterList( knownDB, masterListURL, ignoredURLs );

        for ( ScrapeResult scrapedInfo : scrapeList ) {
            ModsInfo modsInfo = new ModsInfo();
            modsInfo.setTitle( scrapedInfo.title );
            modsInfo.setAuthor( scrapedInfo.author );
            modsInfo.setThreadURL( scrapedInfo.threadURL );
            modsInfo.setThreadHash( scrapedInfo.threadHash );
            modsInfo.setDescription( scrapedInfo.rawDesc );
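            // Version details can't be scraped reliably, so "???" placeholders
            // stand in; presumably they get filled in later when the dumped
            // xml is hand-edited.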
            modsInfo.putVersion( "???", "???"+ (scrapedInfo.wip ? " WIP" : "") );
            results.add( modsInfo );
        }

        return results;
    }


    /**
     * Scrapes the Master Mod List on the FTL forum.
     *
     * If an existing ModDB is provided, its thread urls will be checked too.
     *
     * @param knownDB a ModDB with mods to ignore if threadHash is unchanged
     * @param masterListURL the url of the Master Mod List thread
     * @param ignoredURLs a list of uninteresting threadURLs to ignore
     */
    private static List<ScrapeResult> scrapeMasterList( ModDB knownDB, String masterListURL, List<String> ignoredURLs ) throws IOException, NoSuchAlgorithmException {
        if ( ignoredURLs == null ) ignoredURLs = new ArrayList<String>();

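        // modsHeaderPtn finds the underlined "Mods" heading that starts the list.
        // modPtn matches one mod entry per line; groups: (1) thread url,
        // (2) title, (3) an optional "[WIP]" tag, (4) author.
        // An illustrative line it would match (reconstructed from the pattern,
        // not verbatim forum html):
        //   <a href="...viewtopic.php?f=11&amp;t=12345">Some Mod</a> [WIP] - Author: <a href="...">SomeAuthor</a>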
        Pattern modsHeaderPtn = Pattern.compile( Pattern.quote( "<span style=\"font-weight: bold\"><span style=\"text-decoration: underline\"><span style=\"font-size: 150%; line-height: 116%;\">Mods</span></span></span>" ) );
        Pattern modPtn = Pattern.compile( "^(?:\\[[A-Za-z0-9 ]+ *\\])?<a href=\"([^\"]+)\"[^>]*>([^>]+)</a> *((?:\\[[A-Za-z0-9 ]+\\])?)(?: (?:.*?))? - Author: <a href=\"[^\"]+\"[^>]*>([^<]+?)</a>" );

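        // First-post hashes already in the catalog; threads that still hash to
        // one of these are unchanged and get pruned near the end.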
        HashSet<String> boringHashes = new HashSet<String>();
        if ( knownDB != null ) {
            for ( ModInfo modInfo : knownDB.getCatalog() ) {
                String threadHash = knownDB.getThreadHash( modInfo.getURL() );
                if ( threadHash == null ) {
                    log.debug( "No thread hash for modInfo: "+ modInfo.getTitle() );
                }
                if ( threadHash != null && !threadHash.equals( "???" ) )
                    boringHashes.add( threadHash );
            }
        }

        String postContent = getFirstPost( masterListURL );
        postContent = postContent.replaceAll( "<br */>", "\n" );

        String[] lines = postContent.split( "\n" );
        List<ScrapeResult> results = new ArrayList<ScrapeResult>();
        List<String> pendingURLs = new ArrayList<String>();
        boolean inMods = false;
        Matcher m = null;

        for ( String line : lines ) {
            if ( modsHeaderPtn.matcher( line ).find() ) {
                inMods = true;
                continue;
            }
            if ( !inMods ) continue;

            m = modPtn.matcher( line );
            if ( m.find() ) {
                ScrapeResult result = new ScrapeResult();
                result.threadURL = m.group( 1 );
                result.title = m.group( 2 );
                result.author = m.group( 4 );
                result.wip = m.group( 3 ).equals( "[WIP]" );
                result.rawDesc = "";
                result.threadHash = "???";

                // Unescape the html entities phpBB puts in titles and urls.
                result.title = result.title.replaceAll( "&amp;", "&" );
                result.threadURL = result.threadURL.replaceAll( "&amp;", "&" );
                results.add( result );
            }
        }

        if ( knownDB != null ) {
            for ( ScrapeResult result : results ) {
                pendingURLs.add( result.threadURL );
            }
            for ( ModInfo modInfo : knownDB.getCatalog() ) {
                if ( !modInfo.getURL().equals( "???" ) && !pendingURLs.contains( modInfo.getURL() ) ) {
                    pendingURLs.add( modInfo.getURL() );
                    ScrapeResult result = new ScrapeResult();
                    result.threadURL = modInfo.getURL();
                    result.title = modInfo.getTitle();
                    result.author = modInfo.getAuthor();
                    result.wip = false;  // *shrug*
                    result.rawDesc = modInfo.getDescription();
                    result.threadHash = knownDB.getThreadHash( modInfo.getURL() );
                    results.add( result );
                }
            }
        }

        // Prune results with boring urls.
        for ( Iterator<ScrapeResult> it = results.iterator(); it.hasNext(); ) {
            ScrapeResult result = it.next();
            if ( ignoredURLs.contains( result.threadURL ) )
                it.remove();
        }

        // Fetch and hash each thread url.
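        // Sleeps 2 seconds between requests to be polite to the server, and
        // retries a failed request indefinitely at 5-second intervals.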
        for ( int i=0; i < results.size(); i++ ) {
            ScrapeResult result = results.get( i );
            if ( result.threadURL.startsWith( FORUM_URL_FRAGMENT ) == false )
                continue;  // Don't bother scraping and hashing non-forum urls.

            try {Thread.sleep( 2000 );}
            catch ( InterruptedException e ) {log.info( "Inter-fetch sleep interrupted." );}

            log.info( "" );
            log.info( String.format( "Scraping mod %03d/%03d (%s)...", (i+1), results.size(), result.title ) );
            while ( true ) {
                try {
                    result.rawDesc = getFirstPost( result.threadURL );
                    result.threadHash = PackUtilities.calcStreamMD5( new ByteArrayInputStream( result.rawDesc.getBytes( Charset.forName( "UTF-8" ) ) ) );
                    break;
                }
                catch ( IOException e ) {
                    log.error( "Request failed: "+ e.getMessage() );
                }
                try {Thread.sleep( 5000 );}
                catch ( InterruptedException e ) {log.error( "Re-fetch sleep interrupted.", e );}
            }
        }

        // Ignore threads whose hashes haven't changed.
        for ( Iterator<ScrapeResult> it = results.iterator(); it.hasNext(); ) {
            ScrapeResult result = it.next();
            if ( boringHashes.contains( result.threadHash ) )
                it.remove();
        }

        // Scrub html out of descriptions and scrape download links.
        for ( ScrapeResult result : results ) {
            postContent = result.rawDesc;
            postContent = postContent.replaceAll( "<br */>", "\n" );
            postContent = postContent.replaceAll( "<img [^>]*/>", "" );
            postContent = postContent.replaceAll( "<span [^>]*>", "" );
            postContent = postContent.replaceAll( "</span>", "" );
            postContent = postContent.replaceAll( "&quot;", "\"" );
            postContent = postContent.replaceAll( "\u2018|\u2019", "'" );
            postContent = postContent.replaceAll( "\u2022", "-" );
            postContent = postContent.replaceAll( "\u2013", "-" );
            postContent = postContent.replaceAll( "\u00a9", "()" );
            postContent = postContent.replaceAll( "&amp;", "&" );
            postContent = postContent.replaceAll( "<a (?:[^>]+ )?href=\"([^\"]+)\"[^>]*>", "<a href=\"$1\">" );
            postContent = postContent.replaceAll( "<a href=\"[^\"]+/forum/memberlist.php[^\"]+\"[^>]*>([^<]+)</a>", "$1" );
            postContent = postContent.replaceAll( "<a href=\"http://(?:i.imgur.com/|[^\"]*photobucket.com/|[^\"]*deviantart.com/|www.mediafire.com/view/[?])[^\"]+\"[^>]*>([^<]+)</a>", "$1" );
            postContent = postContent.replaceAll( "<a href=\"([^\"]+)\"[^>]*>(?:\\1|[^<]+ [.][.][.] [^<]+)</a>", "<a href=\"$1\">Link</a>" );
            postContent = postContent.replaceAll( "<a href=\"[^\"]+[.](?:jpg|png)(?:[.]html)?\"[^>]*>([^<]*)</a>", "$1" );
            postContent = postContent.replaceAll( "</li><li>", "</li>\n<li>" );
            postContent = postContent.replaceAll( "<li>(.*?)</li>", " - $1" );
            postContent = postContent.replaceAll( "</li>", "" );
            postContent = postContent.replaceAll( "</?ul>", "" );
            postContent = postContent.replaceAll( "(?s)<blockquote [^>]+><div>(.*?)</div></blockquote>", "<blockquote>$1</blockquote>" );
            postContent = postContent.replaceAll( "<!-- [^>]+ -->", "" );

            // Link to GMM Thread.
            postContent = postContent.replaceAll( "<a href=\"[^\"]+/forum/viewtopic.php?(?:[^&]+&)*t=2464\"[^>]*>([^<]+)</a>", "$1" );
            // Link to Superluminal Thread.
            postContent = postContent.replaceAll( "<a href=\"[^\"]+/forum/viewtopic.php?(?:[^&]+&)*t=11251\"[^>]*>([^<]+)</a>", "$1" );
            // Link to FTLEdit Thread.
            postContent = postContent.replaceAll( "<a href=\"[^\"]+/forum/viewtopic.php?(?:[^&]+&)*t=2959\"[^>]*>([^<]+)</a>", "$1" );

            postContent = postContent.replaceAll( "\\A\\s+", "" );
            postContent = postContent.replaceAll( "\\s+\\Z", "" );
            result.rawDesc = postContent +"\n";  // Raw quoting looks better with a newline.
        }

        return results;
    }


    /**
     * Extracts the html content of the first post in a forum thread.
     */
    private static String getFirstPost( String url ) throws IOException {
        String htmlSrc = fetchWebPage( url );

        Pattern firstPostPtn = Pattern.compile( "(?s)<div class=\"postbody\"[^>]*>.*?<div class=\"content\"[^>]*>(.*?)</div>\\s*<dl class=\"postprofile\"[^>]*>" );
        Matcher m = null;

        String postContent = "";
        m = firstPostPtn.matcher( htmlSrc );
        if ( m.find() ) {
            postContent = m.group( 1 );
            postContent = postContent.replaceAll( "\r?\n", "" );

            // Within content, but it counts clicks/views, which throws off hashing.
            postContent = postContent.replaceAll( "(?s)<div class=\"inline-attachment\">.*?</div>", "" );

            // Footer junk.
            //postContent = postContent.replaceAll( "(?s)<dl class=\"attachbox\">.*?<dl class=\"file\">.*?</dl>.*?</dl>", "" );
            postContent = postContent.replaceAll( "(?s)<dl class=\"file\">.*?</dl>", "" );
            postContent = postContent.replaceAll( "(?s)<dd>\\s*?</dd>", "" );
            postContent = postContent.replaceAll( "(?s)<dl class=\"attachbox\">.*?</dl>", "" );
            postContent = postContent.replaceAll( "(?s)<div (?:[^>]+ )?class=\"notice\">.*?</div>", "" );
            postContent = postContent.replaceAll( "(?s)<div (?:[^>]+ )?class=\"signature\">.*?</div>", "" );
            postContent = postContent.replaceAll( "</div>\\s*\\Z", "" );
            postContent = postContent.replaceAll( "\\A\\s+", "" );
            postContent = postContent.replaceAll( "\\s+\\Z", "" );
        }

        return postContent;
    }


    /**
     * Calculates an MD5 hash of the first post in a thread.
     */
    private static String hashThread( String url ) throws IOException, NoSuchAlgorithmException {
        String rawDesc = getFirstPost( url );
        return PackUtilities.calcStreamMD5( new ByteArrayInputStream( rawDesc.getBytes( Charset.forName( "UTF-8" ) ) ) );
    }


    /**
     * Downloads a URL and returns the string content, decoded as UTF-8.
     */
    private static String fetchWebPage( String url ) throws IOException {
        String result = null;
        InputStream urlIn = null;
        ByteArrayOutputStream bytesOut = null;

        try {
            URLConnection conn = new URL( url ).openConnection();

            if ( conn instanceof HttpURLConnection == false ) {
                throw new MalformedURLException( String.format( "Non-Http(s) URL given to fetch: %s", url ) );
            }
            HttpURLConnection httpConn = (HttpURLConnection)conn;

            httpConn.setReadTimeout( 10000 );
            httpConn.connect();

            int responseCode = httpConn.getResponseCode();

            if ( responseCode == HttpURLConnection.HTTP_OK ) {
                int contentLength = conn.getContentLength();
                urlIn = httpConn.getInputStream();
                bytesOut = new ByteArrayOutputStream( (contentLength > 0) ? contentLength : 4096 );

                byte[] buf = new byte[4096];
                int len;
                while ( (len = urlIn.read( buf )) >= 0 ) {
                    bytesOut.write( buf, 0, len );
                }
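                // Decode strictly: the decoder's default action for malformed
                // bytes is to report, so a non-UTF-8 response throws a
                // CharacterCodingException (a subclass of IOException) instead
                // of being silently mangled, as new String( bytes, "UTF-8" )
                // would allow.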
                byte[] allBytes = bytesOut.toByteArray();
                CharsetDecoder decoder = Charset.forName( "UTF-8" ).newDecoder();
                ByteBuffer byteBuffer = ByteBuffer.wrap( allBytes, 0, allBytes.length );
                result = decoder.decode( byteBuffer ).toString();
            }
            else {
                // Otherwise result would remain null, and callers would crash on it.
                throw new IOException( String.format( "Request for %s returned HTTP %d", url, responseCode ) );
            }
        }
        finally {
            try {if ( urlIn != null ) urlIn.close();}
            catch ( IOException e ) {}

            // No need to close an array stream.
        }

        return result;
    }


    /**
     * Writes collated catalog entries to a file, as human-editable xml.
     */
    private static void writeXML( List<ModsInfo> data, File dstFile ) throws IOException, NoSuchAlgorithmException {
        OutputStream os = null;
        try {
            os = new FileOutputStream( dstFile );
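            // US-ASCII presumably keeps the dump safe to hand-edit in any
            // editor; any unmappable chars are written as '?' by the writer's
            // default replacement behavior.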
            OutputStreamWriter writer = new OutputStreamWriter( os, Charset.forName( "US-ASCII" ) );
            writeXML( data, writer );
            writer.flush();
        }
        finally {
            try {if ( os != null ) os.close();}
            catch ( IOException e ) {}
        }
    }

    private static void writeXML( List<ModsInfo> data, OutputStreamWriter dst ) throws IOException {
        boolean first = true;
        dst.append( "<?xml version=\"1.0\" encoding=\""+ dst.getEncoding() +"\"?>\n" );
        dst.append( "<modsinfoList>\n" );
        for ( ModsInfo modsInfo : data ) {
            if ( !first ) dst.append( "\n" );

            writeXML( modsInfo, dst, " ", 1 );
            first = false;
        }
        dst.append( "</modsinfoList>" );
    }

    private static void writeXML( ModsInfo modsInfo, OutputStreamWriter dst, String indent, int depth ) throws IOException {
        Format xmlFormat = Format.getPrettyFormat();
        xmlFormat.setEncoding( dst.getEncoding() );
        XMLOutputter xmlOut = new XMLOutputter( xmlFormat );

        writeIndent( dst, indent, depth++ ).append( "<modsinfo>\n" );
        writeIndent( dst, indent, depth ); dst.append( "<title>" ).append( xmlOut.escapeElementEntities( modsInfo.getTitle() ) ).append( "</title>\n" );
        writeIndent( dst, indent, depth ); dst.append( "<author>" ).append( xmlOut.escapeElementEntities( modsInfo.getAuthor() ) ).append( "</author>\n" );
        writeIndent( dst, indent, depth ); dst.append( "<threadUrl><![CDATA[ " ).append( modsInfo.getThreadURL() ).append( " ]]></threadUrl>\n" );

        writeIndent( dst, indent, depth++ ).append( "<versions>\n" );
        for ( Map.Entry<String,String> entry : modsInfo.getVersionsMap().entrySet() ) {
            String versionFileHash = entry.getKey();
            String versionString = entry.getValue();

            writeIndent( dst, indent, depth );
            dst.append( "<version hash=\"" ).append( xmlOut.escapeAttributeEntities( versionFileHash ) ).append( "\">" );
            dst.append( xmlOut.escapeElementEntities( versionString ) );
            dst.append( "</version>" ).append( "\n" );
        }
        writeIndent( dst, indent, --depth ).append( "</versions>\n" );
        writeIndent( dst, indent, depth ); dst.append( "<threadHash>" ).append( modsInfo.getThreadHash() ).append( "</threadHash>\n" );
        dst.append( "\n" );

        writeIndent( dst, indent, depth ); dst.append( "<description>" ).append( "<![CDATA[" );
        dst.append( modsInfo.getDescription() );
        dst.append( "]]>\n" );
        writeIndent( dst, indent, depth ); dst.append( "</description>\n" );

        writeIndent( dst, indent, --depth ).append( "</modsinfo>\n" );
    }

    /**
     * Adds indentation to a given depth.
     */
    private static Appendable writeIndent( Appendable dst, String indent, int depth ) throws IOException {
        for ( int i=0; i < depth; i++ ) dst.append( indent );
        return dst;
    }


    /**
     * Parses dumped xml and returns a new catalog.
     */
    private static ModDB parseCatalogXML( File srcFile ) throws IOException, JDOMException {
        ModDB modDB = new ModDB();
        SAXBuilder builder = new SAXBuilder();
        InputStream is = null;

        try {
            is = new FileInputStream( srcFile );
            Document doc = builder.build( is );
            Element rootNode = doc.getRootElement();

            for ( Element infoNode : rootNode.getChildren( "modsinfo" ) ) {
                String threadURL = infoNode.getChildTextTrim( "threadUrl" );
                String threadHash = infoNode.getChildTextTrim( "threadHash" );

                if ( !threadURL.equals( "???" ) && !threadHash.equals( "???" ) ) {
                    String oldHash = modDB.getThreadHash( threadURL );
                    if ( oldHash != null && !oldHash.equals( threadHash ) ) {
                        log.warn( "Multiple thread hashes for url: "+ threadURL );
                    }
                    modDB.putThreadHash( threadURL, threadHash );
                }

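                // The xml groups versions under a single modsinfo element;
                // the ModDB wants a separate ModInfo per version.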
                for ( Element versionNode : infoNode.getChild( "versions" ).getChildren( "version" ) ) {
                    ModInfo modInfo = new ModInfo();
                    modInfo.setTitle( infoNode.getChildTextTrim( "title" ) );
                    modInfo.setAuthor( infoNode.getChildTextTrim( "author" ) );
                    modInfo.setURL( threadURL );
                    modInfo.setDescription( infoNode.getChildTextTrim( "description" ) );
                    modInfo.setFileHash( versionNode.getAttributeValue( "hash" ) );
                    modInfo.setVersion( versionNode.getTextTrim() );
                    modDB.addMod( modInfo );
                }
            }
        }
        finally {
            try {if ( is != null ) is.close();}
            catch ( IOException e ) {}
        }

        return modDB;
    }


    /** Information gleaned from scraping the forum. */
    private static class ScrapeResult {
        public String threadURL = null;
        public String title = null;
        public String author = null;
        public boolean wip = false;
        public String rawDesc = null;
        public String threadHash = null;
    }
}