From 248cc5d6aafbdd1cf27bac129e249f715bf9adbd Mon Sep 17 00:00:00 2001 From: Vhati Date: Sat, 24 Aug 2013 13:27:01 -0400 Subject: [PATCH] Added forum scraper for catalog maintenance --- .../java/net/vhati/modmanager/core/ModDB.java | 38 + .../modmanager/scraper/ForumScraper.java | 716 ++++++++++++++++++ 2 files changed, 754 insertions(+) create mode 100644 src/main/java/net/vhati/modmanager/scraper/ForumScraper.java diff --git a/src/main/java/net/vhati/modmanager/core/ModDB.java b/src/main/java/net/vhati/modmanager/core/ModDB.java index 6b2dea5..45b4ef8 100644 --- a/src/main/java/net/vhati/modmanager/core/ModDB.java +++ b/src/main/java/net/vhati/modmanager/core/ModDB.java @@ -9,6 +9,10 @@ import net.vhati.modmanager.core.ModInfo; public class ModDB { + public static final String EXACT = "exact"; + public static final String FUZZY = "fuzzy"; + + // Associates Forum thread urls with hashes of their first post's content. private HashMap threadHashMap = new HashMap(); @@ -37,6 +41,7 @@ public class ModDB { catalog.remove( modInfo ); } + /** * Stores the first-post content hash of a forum thread. */ @@ -59,4 +64,37 @@ public class ModDB { public List getCatalog() { return catalog; } + + + /** + * Returns ModInfos that are likely revisions in the same series. + * + * The searched item will appear in the results as well. + * + * The returned map contains two lists, keyed to constants: + * EXACT - All attributes match (excluding fileHash/fileVersion). + * FUZZY - Title and URL match, but not everything. 
+ */ + public HashMap> getSimilarMods( ModInfo modInfo ) { + HashMap> resultsMap = new HashMap>(); + resultsMap.put( EXACT, new ArrayList() ); + resultsMap.put( FUZZY, new ArrayList() ); + + for ( ModInfo altInfo : catalog ) { + if ( altInfo.getTitle().equals( modInfo.getTitle() ) ) { + if ( altInfo.getURL().equals( modInfo.getURL() ) ) { + boolean exact = true; + + if ( !altInfo.getDescription().equals( modInfo.getDescription() ) ) + exact = false; + else if ( !altInfo.getAuthor().equals( modInfo.getAuthor() ) ) + exact = false; + + resultsMap.get( exact ? EXACT : FUZZY ).add( altInfo ); + } + } + } + + return resultsMap; + } } diff --git a/src/main/java/net/vhati/modmanager/scraper/ForumScraper.java b/src/main/java/net/vhati/modmanager/scraper/ForumScraper.java new file mode 100644 index 0000000..e5c8a6b --- /dev/null +++ b/src/main/java/net/vhati/modmanager/scraper/ForumScraper.java @@ -0,0 +1,716 @@ +/* + * Ignore this package. + * It's for Slipstream/GMM catalog maintenance. + */ + +package net.vhati.modmanager.scraper; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.InputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.io.OutputStreamWriter; +import java.net.HttpURLConnection; +import java.net.MalformedURLException; +import java.net.URL; +import java.net.URLConnection; +import java.nio.ByteBuffer; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CharsetEncoder; +import java.security.NoSuchAlgorithmException; +import java.util.ArrayList; +import java.util.List; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import net.vhati.ftldat.FTLDat; +import 
net.vhati.modmanager.core.ModDB; +import net.vhati.modmanager.core.ModInfo; +import net.vhati.modmanager.json.JacksonGrognakCatalogReader; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ArrayNode; +import com.fasterxml.jackson.databind.node.ObjectNode; + +import org.jdom2.Document; +import org.jdom2.Element; +import org.jdom2.JDOMException; +import org.jdom2.input.SAXBuilder; +import org.jdom2.output.Format; +import org.jdom2.output.XMLOutputter; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + + +public class ForumScraper { + + private static final Logger log = LogManager.getLogger(ForumScraper.class); + + private static final String MASTER_LIST_URL = "http://www.ftlgame.com/forum/viewtopic.php?f=11&t=2645"; + private static final String FORUM_URL_FRAGMENT = "http://www.ftlgame.com/forum/viewtopic.php"; + + + public static void main( String[] args ) { + + List ignoredURLs = new ArrayList(); + ignoredURLs.add( "http://www.ftlgame.com/forum/viewtopic.php?f=11&t=11561" ); + ignoredURLs.add( "http://www.ftlgame.com/forum/viewtopic.php?f=12&t=11083" ); + ignoredURLs.add( "http://www.ftlgame.com/forum/viewtopic.php?f=4&t=2938" ); + ignoredURLs.add( "http://www.moddb.com/mods/better-planets-and-backgrounds/downloads/better-asteroids" ); + // SpaceDock is an app. + ignoredURLs.add( "http://www.ftlgame.com/forum/viewtopic.php?f=11&t=16842" ); + // Beginning Scrap Advantage is bundled in GMM. + ignoredURLs.add( "http://www.ftlgame.com/forum/viewtopic.php?f=4&t=2464" ); + + List