/* * Ignore this package. * It's for Slipstream/GMM catalog maintenance. */ package net.vhati.modmanager.scraper; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.InputStream; import java.io.IOException; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.URL; import java.net.URLConnection; import java.nio.ByteBuffer; import java.nio.charset.CharacterCodingException; import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.nio.charset.CharsetEncoder; import java.security.NoSuchAlgorithmException; import java.util.ArrayList; import java.util.List; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.node.ArrayNode; import com.fasterxml.jackson.databind.node.ObjectNode; import org.jdom2.Document; import org.jdom2.Element; import org.jdom2.JDOMException; import org.jdom2.input.SAXBuilder; import org.jdom2.output.Format; import org.jdom2.output.XMLOutputter; import org.apache.commons.cli.BasicParser; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.ParseException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import net.vhati.ftldat.PackUtilities; import net.vhati.modmanager.core.ModDB; import net.vhati.modmanager.core.ModInfo; import net.vhati.modmanager.core.ModsInfo; import net.vhati.modmanager.json.JacksonCatalogReader; import net.vhati.modmanager.json.JacksonCatalogWriter; public class ForumScraper { private static final Logger log = LoggerFactory.getLogger( ForumScraper.class ); private static final String MASTER_LIST_URL = "https://subsetgames.com/forum/viewtopic.php?f=11&t=2645"; private static final String FORUM_URL_FRAGMENT = "https://subsetgames.com/forum/viewtopic.php"; public static void main( String[] args ) { List ignoredURLs = new ArrayList(); ignoredURLs.add( "https://subsetgames.com/forum/viewtopic.php?f=11&t=11561" ); ignoredURLs.add( "https://subsetgames.com/forum/viewtopic.php?f=12&t=11083" ); ignoredURLs.add( "https://subsetgames.com/forum/viewtopic.php?f=4&t=2938" ); ignoredURLs.add( "http://www.moddb.com/mods/better-planets-and-backgrounds/downloads/better-asteroids" ); ignoredURLs.add( "https://subsetgames.com/forum/viewtopic.php?f=4&t=2947" ); ignoredURLs.add( "https://subsetgames.com/forum/viewtopic.php?f=12&t=11604" ); // Hissatsu's post on "Advanced Battle Systems". ignoredURLs.add( "https://subsetgames.com/forum/viewtopic.php?f=11&t=11469&start=60#p55171" ); // SpaceDock is an app. ignoredURLs.add( "https://subsetgames.com/forum/viewtopic.php?f=11&t=16842" ); // Beginning Scrap Advantage is bundled in GMM. ignoredURLs.add( "https://subsetgames.com/forum/viewtopic.php?f=4&t=2464" ); // Engi Scrap Advantage is bundled in SMM. ignoredURLs.add( "https://subsetgames.com/forum/viewtopic.php?f=12&t=17102" ); BasicParser parser = new BasicParser(); Options options = new Options(); options.addOption( OptionBuilder.withLongOpt( "load-json" ) .withDescription( "load moddb from a json catalog" ) .hasArg() .withArgName("FILE") .create() ); options.addOption( OptionBuilder.withLongOpt( "load-xml" ) .withDescription( "load moddb from an xml file" ) .hasArg() .withArgName("FILE") .create() ); options.addOption( OptionBuilder.withLongOpt( "scrape" ) .withDescription( "write changed forum posts to an xml file" ) .hasArg() .withArgName("FILE") .create() ); options.addOption( OptionBuilder.withLongOpt( "dump-json" ) .withDescription( "write the moddb to a json file" ) .hasArg() .withArgName("FILE") .create() ); options.addOption( OptionBuilder.withLongOpt( "dump-xml" ) .withDescription( "write the moddb to an xml file" ) .hasArg() .withArgName("FILE") .create() ); options.addOption( OptionBuilder.withLongOpt( "hash-thread" ) .withDescription( "print the hash of a specific thread" ) .hasArg() .withArgName("URL") .create() ); options.addOption( OptionBuilder.withLongOpt( "first-post" ) .withDescription( "print the first post of a thread (debugging)" ) .hasArg() .withArgName("URL") .create() ); options.addOption( "h", "help", false, "display this help and exit" ); CommandLine cmdline = null; try { cmdline = parser.parse( options, args, true ); } catch( ParseException e ) { System.err.println( "Error parsing commandline: "+ e.getMessage() ); System.exit( 1 ); } if ( cmdline.hasOption( "h" ) ) { HelpFormatter formatter = new HelpFormatter(); String syntax = ForumScraper.class.getCanonicalName() +" [OPTIONS]"; String helpHeader = "Load an existing catalog as the moddb, and scrape."+ formatter.getNewLine(); helpHeader += "Edit the catalog by copy/pasting scrape snippets."+ formatter.getNewLine(); helpHeader += "Load the edited catalog and dump json."+ formatter.getNewLine(); PrintWriter pw = new PrintWriter( System.out ); formatter.printUsage( pw, formatter.getWidth(), syntax ); pw.write( helpHeader ); pw.write( formatter.getNewLine() ); formatter.printOptions( pw, formatter.getWidth(), options, formatter.getLeftPadding(), formatter.getDescPadding() ); pw.flush(); System.exit( 0 ); } ModDB modDB = new ModDB(); try { if ( cmdline.hasOption( "load-json" ) ) { log.info( "Loading json catalog..." ); File srcFile = new File( cmdline.getOptionValue( "load-json" ) ); ModDB newDB = JacksonCatalogReader.parse( srcFile ); if ( newDB != null ) modDB = newDB; } if ( cmdline.hasOption( "load-xml" ) ) { log.info( "Loading xml catalog..." ); File srcFile = new File( cmdline.getOptionValue( "load-xml" ) ); ModDB newDB = parseCatalogXML( srcFile ); if ( newDB != null ) modDB = newDB; } if ( cmdline.hasOption( "scrape" ) ) { log.info( "Scraping..." ); File dstFile = new File( cmdline.getOptionValue( "scrape" ) ); List data = scrape( modDB, MASTER_LIST_URL, ignoredURLs ); if ( data.size() > 0 ) writeXML( data, dstFile ); } if ( cmdline.hasOption( "dump-json" ) ) { log.info( "Dumping json..." ); File dstFile = new File( cmdline.getOptionValue( "dump-json" ) ); List data = modDB.getCollatedModInfo(); if ( data.size() > 0 ) JacksonCatalogWriter.write( data, dstFile ); } if ( cmdline.hasOption( "dump-xml" ) ) { log.info( "Dumping xml..." ); File dstFile = new File( cmdline.getOptionValue( "dump-xml" ) ); List data = modDB.getCollatedModInfo(); if ( data.size() > 0 ) writeXML( data, dstFile ); } if ( cmdline.hasOption( "hash-thread" ) ) { log.info( "Hashing thread..." ); String threadURL = cmdline.getOptionValue( "hash-thread" ); System.out.println( hashThread( threadURL ) ); } if ( cmdline.hasOption( "first-post" ) ) { log.info( "Getting thread's first post..." ); String threadURL = cmdline.getOptionValue( "first-post" ); System.out.println( getFirstPost( threadURL ) ); } } catch ( Exception e ) { log.error( "An error occurred.", e ); } } /** * Scrapes the forum for changed posts and returns info from updated mods. */ private static List scrape( ModDB knownDB, String masterListURL, List ignoredURLs ) throws IOException, NoSuchAlgorithmException { List results = new ArrayList(); List scrapeList = scrapeMasterList( knownDB, masterListURL, ignoredURLs ); for ( ScrapeResult scrapedInfo : scrapeList ) { ModsInfo modsInfo = new ModsInfo(); modsInfo.setTitle( scrapedInfo.title ); modsInfo.setAuthor( scrapedInfo.author ); modsInfo.setThreadURL( scrapedInfo.threadURL ); modsInfo.setThreadHash( scrapedInfo.threadHash ); modsInfo.setDescription( scrapedInfo.rawDesc ); modsInfo.putVersion( "???", "???"+ (scrapedInfo.wip ? " WIP" : "") ); results.add( modsInfo ); } return results; } /** * Scrape the Master Mod List on the FTL forum. * * If an existing ModDB is provided, its thread urls will be checked too. * * @param knownDB a ModDB with mods to ignore if threadHash is unchanged * @param ignoredUrls a list of uninteresting threadURLs to ignore */ private static List scrapeMasterList( ModDB knownDB, String masterListURL, List ignoredURLs ) throws IOException, NoSuchAlgorithmException { if ( ignoredURLs == null ) ignoredURLs = new ArrayList(); Pattern modsHeaderPtn = Pattern.compile( Pattern.quote("Mods") ); Pattern modPtn = Pattern.compile( "^(?:\\[[A-Za-z0-9 ]+ *\\])?]*>([^>]+) *((?:\\[[A-Za-z0-9 ]+\\])?)(?: (?:.*?))? - Author: ]*>([^<]+?)" ); HashSet boringHashes = new HashSet(); if ( knownDB != null ) { for ( ModInfo modInfo : knownDB.getCatalog() ) { String threadHash = knownDB.getThreadHash( modInfo.getURL() ); if ( threadHash == null ) { log.debug( "No thread hash for modInfo: "+ modInfo.getTitle() ); } if ( threadHash != null && !threadHash.equals("???") ) boringHashes.add( threadHash ); } } String postContent = getFirstPost( masterListURL ); postContent = postContent.replaceAll( "
", "\n" ); String[] lines = postContent.split("\n"); List results = new ArrayList(); List pendingURLs = new ArrayList(); boolean inMods = false; Matcher m = null; for ( String line : lines ) { if ( modsHeaderPtn.matcher(line).find() ) { inMods = true; continue; } if ( !inMods ) continue; m = modPtn.matcher(line); if ( m.find() ) { ScrapeResult result = new ScrapeResult(); result.threadURL = m.group(1); result.title = m.group(2); result.author = m.group(4); result.wip = m.group(3).equals("[WIP]"); result.rawDesc = ""; result.threadHash = "???"; result.title = result.title.replaceAll( "&", "&" ); result.threadURL = result.threadURL.replaceAll( "&", "&" ); results.add( result ); } } if ( knownDB != null ) { for ( ScrapeResult result : results ) { pendingURLs.add( result.threadURL ); } for ( ModInfo modInfo : knownDB.getCatalog() ) { if ( !modInfo.getURL().equals("???") && !pendingURLs.contains(modInfo.getURL()) ) { pendingURLs.add( modInfo.getURL() ); ScrapeResult result = new ScrapeResult(); result.threadURL = modInfo.getURL(); result.title = modInfo.getTitle(); result.author = modInfo.getAuthor(); result.wip = false; // *shrug* result.rawDesc = modInfo.getDescription(); result.threadHash = knownDB.getThreadHash( modInfo.getURL() ); results.add( result ); } } } // Prune results with boring urls. for ( Iterator it=results.iterator(); it.hasNext(); ) { ScrapeResult result = it.next(); if ( ignoredURLs.contains( result.threadURL ) ) it.remove(); } // Fetch and hash each thread url. for ( int i=0; i < results.size(); i++ ) { ScrapeResult result = results.get(i); if ( result.threadURL.startsWith( FORUM_URL_FRAGMENT ) == false ) continue; // Don't bother scraping and hashing non-forum urls. try {Thread.sleep( 2000 );} catch ( InterruptedException e ) {log.info( "Inter-fetch sleep interrupted." );} log.info( "" ); log.info( String.format( "Scraping mod %03d/%03d (%s)...", (i+1), results.size(), result.title ) ); while( true ) { try { result.rawDesc = getFirstPost( result.threadURL ); result.threadHash = PackUtilities.calcStreamMD5( new ByteArrayInputStream( result.rawDesc.getBytes( Charset.forName("UTF-8") ) ) ); break; } catch ( IOException e ) { log.error( "Request failed: "+ e.getMessage() ); } try {Thread.sleep( 5000 );} catch ( InterruptedException e ) {log.error( "Re-fetch sleep interrupted.", e );} } } // Ignore threads whose hashes haven't changed. for ( Iterator it=results.iterator(); it.hasNext(); ) { ScrapeResult result = it.next(); if ( boringHashes.contains( result.threadHash ) ) it.remove(); } // Scrub html out of descriptions and scrape download links. for ( ScrapeResult result : results ) { postContent = result.rawDesc; postContent = postContent.replaceAll( "
", "\n" ); postContent = postContent.replaceAll( "]*/>", "" ); postContent = postContent.replaceAll( "]*>", "" ); postContent = postContent.replaceAll( "", "" ); postContent = postContent.replaceAll( """, "\"" ); postContent = postContent.replaceAll( "\u2018|\u2019", "'" ); postContent = postContent.replaceAll( "\u2022", "-" ); postContent = postContent.replaceAll( "\u2013", "-" ); postContent = postContent.replaceAll( "\u00a9", "()" ); postContent = postContent.replaceAll( "&", "&" ); postContent = postContent.replaceAll( "]+ )?href=\"([^\"]+)\"[^>]*>", "" ); postContent = postContent.replaceAll( "]*>([^<]+)", "$1" ); postContent = postContent.replaceAll( "]*>([^<]+)", "$1" ); postContent = postContent.replaceAll( "]*>(?:\\1|[^<]+ [.][.][.] [^<]+)", "Link" ); postContent = postContent.replaceAll( "]*>([^<]*)", "$1" ); postContent = postContent.replaceAll( "
  • ", "
  • \n
  • " ); postContent = postContent.replaceAll( "
  • (.*?)
  • ", " - $1" ); postContent = postContent.replaceAll( "", "" ); postContent = postContent.replaceAll( "", "" ); postContent = postContent.replaceAll( "(?s)
    ]+>
    (.*?)
    ", "
    $1
    " ); postContent = postContent.replaceAll( "", "" ); // Link to GMM Thread. postContent = postContent.replaceAll( "]*>([^<]+)", "$1" ); // Link to Superluminal Thread. postContent = postContent.replaceAll( "]*>([^<]+)", "$1" ); // Link to FTLEdit Thread. postContent = postContent.replaceAll( "]*>([^<]+)", "$1" ); postContent = postContent.replaceAll( "\\A\\s+", "" ); postContent = postContent.replaceAll( "\\s+\\Z", "" ); result.rawDesc = postContent +"\n"; // Raw quoting looks better with a newline. } return results; } /** * Extracts the html content of the first post in a forum thread. */ private static String getFirstPost( String url ) throws IOException { String htmlSrc = fetchWebPage( url ); Pattern firstPostPtn = Pattern.compile( "(?s)
    ]*>.*?
    ]*>(.*?)
    \\s*
    ]*>" ); Matcher m = null; String postContent = ""; m = firstPostPtn.matcher( htmlSrc ); if ( m.find() ) { postContent = m.group( 1 ); postContent = postContent.replaceAll( "\r?\n", "" ); // Within content, but it counts clicks/views, which throws off hashing. postContent = postContent.replaceAll( "(?s)
    .*?
    ", "" ); // Footer junk. //postContent = postContent.replaceAll( "(?s)
    .*?
    .*?
    .*?
    ", "" ); postContent = postContent.replaceAll( "(?s)
    .*?
    ", "" ); postContent = postContent.replaceAll( "(?s)
    \\s*?
    ", "" ); postContent = postContent.replaceAll( "(?s)
    .*?
    ", "" ); postContent = postContent.replaceAll( "(?s)
    ]+ )?class=\"notice\">.*?
    ", "" ); postContent = postContent.replaceAll( "(?s)
    ]+ )?class=\"signature\">.*?
    ", "" ); postContent = postContent.replaceAll( "
    \\s*\\Z", "" ); postContent = postContent.replaceAll( "\\A\\s+", "" ); postContent = postContent.replaceAll( "\\s+\\Z", "" ); } return postContent; } /** * Calculates an MD5 hash of the first post in a thread. */ private static String hashThread( String url ) throws IOException, NoSuchAlgorithmException { String rawDesc = getFirstPost( url ); return PackUtilities.calcStreamMD5( new ByteArrayInputStream( rawDesc.getBytes( Charset.forName("UTF-8") ) ) ); } /** * Downloads a URL and returns the string content, decoded as UTF-8. */ private static String fetchWebPage( String url ) throws IOException { String result = null; InputStream urlIn = null; ByteArrayOutputStream bytesOut = null; try { URLConnection conn = new URL( url ).openConnection(); if ( conn instanceof HttpURLConnection == false ) { throw new MalformedURLException( String.format( "Non-Http(s) URL given to fetch: %s", url ) ); } HttpURLConnection httpConn = (HttpURLConnection)conn; httpConn.setReadTimeout( 10000 ); httpConn.connect(); int responseCode = httpConn.getResponseCode(); if ( responseCode == HttpURLConnection.HTTP_OK ) { int contentLength = conn.getContentLength(); urlIn = httpConn.getInputStream(); bytesOut = new ByteArrayOutputStream( contentLength>0 ? contentLength : 4096 ); byte[] buf = new byte[4096]; int len; while ( (len = urlIn.read(buf)) >= 0 ) { bytesOut.write( buf, 0, len ); } byte[] allBytes = bytesOut.toByteArray(); CharsetDecoder decoder = Charset.forName( "UTF-8" ).newDecoder(); ByteBuffer byteBuffer = ByteBuffer.wrap( allBytes, 0, allBytes.length ); result = decoder.decode( byteBuffer ).toString(); } } finally { try {if ( urlIn != null ) urlIn.close();} catch ( IOException e ) {} // No need to close an array stream. } return result; } /** * Writes collated catalog entries to a file, as human-editable xml. */ private static void writeXML( List data, File dstFile ) throws IOException, NoSuchAlgorithmException { OutputStream os = null; try { os = new FileOutputStream( dstFile ); OutputStreamWriter writer = new OutputStreamWriter( os, Charset.forName("US-ASCII") ); writeXML( data, writer ); writer.flush(); } finally { try {if ( os != null ) os.close();} catch ( IOException e ) {} } } private static void writeXML( List data, OutputStreamWriter dst ) throws IOException { boolean first = true; dst.append( "\n" ); dst.append( "\n" ); for ( ModsInfo modsInfo : data ) { if ( !first ) dst.append( "\n" ); writeXML( modsInfo, dst, " ", 1 ); first = false; } dst.append( "" ); } private static void writeXML( ModsInfo modsInfo, OutputStreamWriter dst, String indent, int depth ) throws IOException { Format xmlFormat = Format.getPrettyFormat(); xmlFormat.setEncoding( dst.getEncoding() ); XMLOutputter xmlOut = new XMLOutputter( xmlFormat ); writeIndent( dst, indent, depth++ ).append( "\n" ); writeIndent( dst, indent, depth ); dst.append("").append( xmlOut.escapeElementEntities( modsInfo.getTitle() ) ).append( "\n" ); writeIndent( dst, indent, depth ); dst.append("").append( xmlOut.escapeElementEntities( modsInfo.getAuthor() ) ).append( "\n" ); writeIndent( dst, indent, depth ); dst.append("\n" ); writeIndent( dst, indent, depth++ ).append( "\n" ); for ( Map.Entry entry : modsInfo.getVersionsMap().entrySet() ) { String versionFileHash = entry.getKey(); String versionString = entry.getValue(); writeIndent( dst, indent, depth ); dst.append( "" ); dst.append( xmlOut.escapeElementEntities( versionString ) ); dst.append( "" ).append( "\n" ); } writeIndent( dst, indent, --depth ).append( "\n" ); writeIndent( dst, indent, depth ); dst.append("").append( modsInfo.getThreadHash() ).append( "\n" ); dst.append( "\n" ); writeIndent( dst, indent, depth ); dst.append( "" ).append( "\n" ); writeIndent( dst, indent, depth ); dst.append( "\n" ); writeIndent( dst, indent, --depth ).append( "\n" ); } /** * Adds indentation to a given depth. */ private static Appendable writeIndent( Appendable dst, String indent, int depth ) throws IOException { for ( int i=0; i < depth; i++ ) dst.append( indent ); return dst; } /** * Parses dumped xml and returns a new catalog. */ private static ModDB parseCatalogXML( File srcFile ) throws IOException, JDOMException { ModDB modDB = new ModDB(); SAXBuilder builder = new SAXBuilder(); InputStream is = null; try { is = new FileInputStream( srcFile ); Document doc = builder.build( is ); Element rootNode = doc.getRootElement(); for ( Element infoNode : rootNode.getChildren( "modsinfo" ) ) { String threadURL = infoNode.getChildTextTrim( "threadUrl" ); String threadHash = infoNode.getChildTextTrim( "threadHash" ); if ( !threadURL.equals( "???" ) && !threadHash.equals( "???" ) ) { String oldHash = modDB.getThreadHash( threadURL ); if ( oldHash != null && !oldHash.equals( threadHash ) ) { log.warn( "Multiple thread hashes for url: "+ threadURL ); } modDB.putThreadHash( threadURL, threadHash ); } for ( Element versionNode : infoNode.getChild( "versions" ).getChildren( "version" ) ) { ModInfo modInfo = new ModInfo(); modInfo.setTitle( infoNode.getChildTextTrim( "title" ) ); modInfo.setAuthor( infoNode.getChildTextTrim( "author" ) ); modInfo.setURL( threadURL ); modInfo.setDescription( infoNode.getChildTextTrim( "description" ) ); modInfo.setFileHash( versionNode.getAttributeValue( "hash" ) ); modInfo.setVersion( versionNode.getTextTrim() ); modDB.addMod( modInfo ); } } } finally { try {if ( is != null ) is.close();} catch ( IOException e ) {} } return modDB; } /** Information gleaned from scraping the forum. */ private static class ScrapeResult { public String threadURL = null; public String title = null; public String author = null; public boolean wip = false; public String rawDesc = null; public String threadHash = null; } }