diff --git a/README.md b/README.md index e79b34a..e5da15d 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,68 @@ -# wikipedia-categories -Cleansing Wikipedia Categories using Centrality +# Cleansing Wikipedia Categories using Centrality +## by Paolo Boldi and Corrado Monti + +We propose a novel general technique aimed at pruning and cleansing the Wikipedia category hierarchy, with a tunable level of aggregation. Our approach is endogenous, since it does not use any information coming from Wikipedia articles, but it is based solely on the user-generated (noisy) Wikipedia category folksonomy itself. We show how the proposed techniques can help reduce the level of noise in the hierarchy and discuss how alternative centrality measures can impact the result differently. + +For more information see [the paper, presented at WWW2016 (companion), Wiki Workshop 2016, at Montreal](http://dl.acm.org/ft_gateway.cfm?id=2891111&ftid=1707848). + +# Provided dataset + +* `page2cat.tsv.gz` is a gzipped TSV file with the mapping from Wikipedia pages to cleansed categories, from the most important to the least important. +* `ranked-categories.tsv.gz` is a gzipped TSV file with every Wikipedia category and our importance score. + +We also provide the heads of these files to show what they look like after unzipping. + +If you use the dataset or the code, please cite: +Boldi, Paolo, and Corrado Monti. "Cleansing wikipedia categories using centrality." Proceedings of the 25th International Conference Companion on World Wide Web. International World Wide Web Conferences Steering Committee, 2016. 
+ +Bibtex: + + @inproceedings{boldi2016cleansing, + title={Cleansing wikipedia categories using centrality}, + author={Boldi, Paolo and Monti, Corrado}, + booktitle={Proceedings of the 25th International Conference Companion on World Wide Web}, + pages={969--974}, + year={2016}, + organization={International World Wide Web Conferences Steering Committee} + } + + +PLEASE NOTE: *Experiments described in the paper were run on a 2014 snapshot called +`enwiki-20140203-pages-articles.xml.bz2`, while – to provide an updated version – +this dataset refers to `enwiki-20160407-pages-articles.xml.bz2`.* + +# How to run code + +Set up the environment +---------------------- + +In order to compile the code, you'll need Java 8, Ant and Ivy. To install +them (e.g. inside a clean [Vagrant](http://vagrantup.com/) box with +`ubuntu/trusty64`), you should use these lines: + + sudo apt-get --yes update + sudo apt-get install -y software-properties-common python-software-properties + echo oracle-java8-installer shared/accepted-oracle-license-v1-1 select true | sudo /usr/bin/debconf-set-selections + sudo add-apt-repository ppa:webupd8team/java -y + sudo apt-get update + sudo apt-get --yes install oracle-java8-installer + sudo apt-get --yes install oracle-java8-set-default + sudo apt-get --yes install ant ivy + sudo ln -s -T /usr/share/java/ivy.jar /usr/share/ant/lib/ivy.jar + + +Compile the code +---------------------- + +If the environment is set up properly, you should install git and download this repo with + + sudo apt-get install git + git clone https://github.com/corradomonti/wikipedia-categories.git + +and then go to the directory `java`. There, run: + +* `ant ivy-setupjars` to download dependencies +* `ant` to compile +* `. setcp.sh` to include the produced jar inside the Java classpath. + +Now you are ready to run `run.sh`. 
diff --git a/java/build.properties b/java/build.properties new file mode 100644 index 0000000..2f57d9a --- /dev/null +++ b/java/build.properties @@ -0,0 +1,25 @@ +version=1.0 + +build.sysclasspath=ignore + +jar.base=/usr/share/java +javadoc.base=/usr/share/javadoc + +dist=dist +src=src +test=test +slow=slow +reports=reports +coverage=coverage +checkstyle=checkstyle +docs=docs +build=build +instrumented=instr + +j2se.apiurl=http://download.oracle.com/javase/6/docs/api/ +fastutil.apiurl=http://fastutil.dsi.unimi.it/docs/ +jsap.apiurl=http://www.martiansoftware.com/jsap/doc/javadoc/ +junit.apiurl=http://junit.sourceforge.net/javadoc_40/ +log4j.apiurl=http://logging.apache.org/log4j/1.2/apidocs/ +slf4j.apiurl=http://www.slf4j.org/apidocs/ +webgraph.apiurl=http://webgraph.dsi.unimi.it/docs/ \ No newline at end of file diff --git a/java/build.xml b/java/build.xml new file mode 100644 index 0000000..23dd2ce --- /dev/null +++ b/java/build.xml @@ -0,0 +1,326 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/java/ivy.xml b/java/ivy.xml new file mode 100644 index 0000000..2db5e40 --- /dev/null +++ b/java/ivy.xml @@ -0,0 +1,40 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/java/setcp.sh b/java/setcp.sh new file mode 100755 index 0000000..bc23aa3 --- /dev/null +++ b/java/setcp.sh @@ -0,0 +1,14 @@ +JAR=wikicategories + +sourcedir=$(cd $(dirname 
${BASH_ARGV[0]}) && pwd) +count=$(\ls -1 $sourcedir/$JAR-*.jar 2>/dev/null | wc -l) + +if (( count == 0 )); then + echo "WARNING: no $JAR jar file." +elif (( count > 1 )); then + echo "WARNING: several $JAR jar files ($(\ls -m $JAR-*.jar))" +else + export CLASSPATH=$(ls -1 $sourcedir/$JAR-*.jar | tail -n 1):$CLASSPATH +fi + +export CLASSPATH=$CLASSPATH:$(\ls -1 $sourcedir/jars/runtime/*.jar | paste -d: -s -) diff --git a/java/src/it/unimi/di/wikipedia/categories/CategorySelector.java b/java/src/it/unimi/di/wikipedia/categories/CategorySelector.java new file mode 100644 index 0000000..9e6b4f9 --- /dev/null +++ b/java/src/it/unimi/di/wikipedia/categories/CategorySelector.java @@ -0,0 +1,224 @@ +package it.unimi.di.wikipedia.categories; + +import it.unimi.di.wikipedia.utils.MapUtils; +import it.unimi.dsi.Util; +import it.unimi.dsi.fastutil.ints.Int2DoubleMap; +import it.unimi.dsi.fastutil.ints.Int2DoubleOpenHashMap; +import it.unimi.dsi.fastutil.ints.Int2ObjectMap; +import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap; +import it.unimi.dsi.fastutil.ints.IntArrays; +import it.unimi.dsi.fastutil.ints.IntOpenHashSet; +import it.unimi.dsi.fastutil.ints.IntSet; +import it.unimi.dsi.fastutil.io.BinIO; +import it.unimi.dsi.logging.ProgressLogger; +import it.unimi.dsi.webgraph.ImmutableGraph; +import it.unimi.dsi.webgraph.Transform; +import it.unimi.dsi.webgraph.algo.GeometricCentralities; + +import java.io.PrintWriter; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import cern.colt.Arrays; + +import com.martiansoftware.jsap.FlaggedOption; +import com.martiansoftware.jsap.JSAP; +import com.martiansoftware.jsap.JSAPResult; +import com.martiansoftware.jsap.Parameter; +import com.martiansoftware.jsap.SimpleJSAP; +import com.martiansoftware.jsap.UnflaggedOption; + +public class CategorySelector { + final static Logger LOGGER = LoggerFactory.getLogger(CategorySelector.class); + + // Input data + private final ImmutableGraph wcg, transposedWcg; + private final 
Int2ObjectMap catId2name; + public final int numOriginalCat, numFinalCat; + public final String[] excludedStrings; + + // Output data + public int[] orderedCatIds; + private Int2DoubleMap catId2rank; + private IntSet milestones, excludedCatIds; + + public CategorySelector(ImmutableGraph wcg, Int2ObjectMap catId2name, int numFinalCat, String[] excludedStrings) { + this.wcg = wcg; + this.transposedWcg = Transform.transpose(wcg); + this.catId2name = catId2name; + this.numOriginalCat = wcg.numNodes(); + this.numFinalCat = numFinalCat; + this.excludedStrings = excludedStrings; + + LOGGER.debug("Examples from the provided Wikipedia Category Graph: "); + for (int i = 0; i < 10; i++) { + int cat = (int) (Math.random() * numOriginalCat); + LOGGER.debug( "\"" + catId2name.get(cat) + "\" is listed as a subcategory of \"" + + catId2name.get(this.wcg.successors(cat).nextInt()) + "\""); + } + } + + private static IntSet findCategoriesContainingStrings(final Int2ObjectMap catId2name, final String[] lowercasedString) { + IntSet results = new IntOpenHashSet(); + String name; + for (Int2ObjectMap.Entry c2n : catId2name.int2ObjectEntrySet()) { + name = c2n.getValue().toLowerCase(); + for (String string : lowercasedString) + if (name.indexOf(string) != -1) { + results.add(c2n.getIntKey()); + break; + } + } + return results; + } + + public void compute() { + LOGGER.info("Ranking nodes..."); + final GeometricCentralities ranker = new GeometricCentralities(transposedWcg, new ProgressLogger(LOGGER)); + try { + ranker.compute(); + } catch (InterruptedException e) { throw new RuntimeException(e); } + catId2rank = new Int2DoubleOpenHashMap(Util.identity(numOriginalCat), ranker.harmonic); + LOGGER.info("Nodes ranked."); + + LOGGER.info("Excluding categories containing " + Arrays.toString(excludedStrings) + "..."); + excludedCatIds = findCategoriesContainingStrings(catId2name, excludedStrings); + for (int catIdToExclude : excludedCatIds) + catId2rank.put(catIdToExclude, 
Double.NEGATIVE_INFINITY); + LOGGER.info(excludedCatIds.size() + " categories excluded, e.g. \"" + catId2name.get(excludedCatIds.toIntArray()[0]) + "\"."); + + LOGGER.info("Ordering categories by centrality and selecting milestones..."); + orderedCatIds = Util.identity(numOriginalCat); + IntArrays.quickSort(orderedCatIds, MapUtils.comparatorPuttingLargestMappedValueFirst(catId2rank)); + milestones = new IntOpenHashSet(IntArrays.trim(orderedCatIds, numFinalCat)); + LOGGER.info(milestones.size() + " milestones selected. 1st category: " + catId2name.get(orderedCatIds[0])); + } + + public Int2ObjectMap recategorize(final Int2ObjectMap page2cat) { + LOGGER.info("Computing closest milestones..."); + final int[] closestMilestones = new HittingDistanceMinimizer(transposedWcg, milestones).compute(); + LOGGER.info("Closest milestones computed, printing a sample:"); + for (int i = 0; i < 10; i++) { + int cat = (int) (Math.random() * numOriginalCat); + System.out.println( "\"" + catId2name.get(cat) + "\" has been ramapped to \"" + + catId2name.get(closestMilestones[cat]) + "\""); + } + + ProgressLogger pl = new ProgressLogger(LOGGER, "pages"); + pl.expectedUpdates = page2cat.size(); + pl.start("Moving old categories to closest milestones..."); + Int2ObjectMap page2newCat = new Int2ObjectOpenHashMap(page2cat.size()); + for (Int2ObjectMap.Entry p2c : page2cat.int2ObjectEntrySet()) { + IntSet newCategories = new IntOpenHashSet(); + int milestone; + for (int cat : p2c.getValue()) { + if (cat < 0 || cat >= numOriginalCat) + LOGGER.error("Category #" + cat + " is not listed in the Wikipedia Category Graph" + + " (it has only " + numOriginalCat + " nodes)."); + else { + milestone = closestMilestones[cat]; + if (milestone != -1) { + if (!milestones.contains(milestone)) + throw new IllegalStateException(milestone + " is not a milestone."); + newCategories.add(milestone); + } + } + } + page2newCat.put(p2c.getIntKey(), newCategories); + pl.lightUpdate(); + } + pl.done(); + + return 
page2newCat; + } + + private String[] toSortedNames(IntSet categories) { + String[] names = new String[categories.size()]; + int[] sortedCat = categories.toIntArray(); + IntArrays.quickSort(sortedCat, MapUtils.comparatorPuttingLargestMappedValueFirst(catId2rank)); + for (int i = 0; i < sortedCat.length; i++) names[i] = catId2name.get(sortedCat[i]); + return names; + } + + @SuppressWarnings({ "unchecked" }) + public static void main( String rawArguments[] ) throws Exception { + SimpleJSAP jsap = new SimpleJSAP( CategorySelector.class.getName(), + "Cleanse the wikipedia categorization system.", + new Parameter[] { + new UnflaggedOption( "WCG", + JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, + "The BVGraph basename of the wikipedia category graph." ), + new UnflaggedOption( "page2cat", + JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, + "The serialized int 2 intset that represents set of categories for each page." ), + new UnflaggedOption( "pageNames", + JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, + "The serialized Int2ObjectMap file with association of categories to their names." ), + new UnflaggedOption( "catNames", + JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, + "The serialized Int2ObjectMap file with association of categories to their names." ), + new FlaggedOption( "exclude", + JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, + 'e', "exclude", + "Exclude all those categories whose LOWERCASED name contains one of the provided strings." ) + .setAllowMultipleDeclarations(true), + new UnflaggedOption( "C", + JSAP.INTEGER_PARSER, "10000", JSAP.REQUIRED, JSAP.NOT_GREEDY, + "Number of categories to retain." ), + new UnflaggedOption( "output-rankedcat", + JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, + "Where the output (ordered) category 2 score TSV file will be saved." 
+ ), + new UnflaggedOption( "output-page2cat", + JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, + "Where the output page2cat TSV file will be saved." + ), + } + ); + + final JSAPResult args = jsap.parse( rawArguments ); + if ( jsap.messagePrinted() ) System.exit( 1 ); + + LOGGER.info("Reading input files..."); + Int2ObjectMap catNames = (Int2ObjectMap) BinIO.loadObject(args.getString("catNames")); + Int2ObjectMap pageNames = (Int2ObjectMap) BinIO.loadObject(args.getString("pageNames")); + Int2ObjectMap page2cat = (Int2ObjectMap) BinIO.loadObject(args.getString("page2cat")); + ImmutableGraph wcg = ImmutableGraph.load(args.getString("WCG")); + final int numFinalCat = args.getInt("C"); + + CategorySelector categorySelector = new CategorySelector(wcg, catNames, numFinalCat, args.getStringArray("exclude")); + categorySelector.compute(); + + LOGGER.info("Writing rankings to " + args.getString("output-rankedcat") + "..."); + PrintWriter out = new PrintWriter(args.getString("output-rankedcat")); + for (int c : categorySelector.orderedCatIds) { + out.print(catNames.get(c)); + out.print("\t"); + out.print(Double.toString(categorySelector.catId2rank.get(c))); + out.println(); + } + out.close(); + + + Int2ObjectMap newPage2cat = categorySelector.recategorize(page2cat); + + LOGGER.info("Writing new page2cat map to " + args.getString("output-page2cat") + "..."); + out = new PrintWriter(args.getString("output-page2cat")); + ProgressLogger pl = new ProgressLogger(LOGGER, "pages"); + for (Int2ObjectMap.Entry p2c : newPage2cat.int2ObjectEntrySet()) { + out.print(pageNames.get(p2c.getIntKey())); + out.print("\t"); + for (String c : categorySelector.toSortedNames(p2c.getValue())) out.print(c + "\t"); + out.println(); + pl.lightUpdate(); + } + pl.done(); + out.close(); + + } + + + + +} diff --git a/java/src/it/unimi/di/wikipedia/categories/HittingDistanceMinimizer.java b/java/src/it/unimi/di/wikipedia/categories/HittingDistanceMinimizer.java new file mode 100644 
index 0000000..fa7defb --- /dev/null +++ b/java/src/it/unimi/di/wikipedia/categories/HittingDistanceMinimizer.java @@ -0,0 +1,150 @@ +package it.unimi.di.wikipedia.categories; + +import it.unimi.dsi.fastutil.ints.IntArrayFIFOQueue; +import it.unimi.dsi.fastutil.ints.IntArrayPriorityQueue; +import it.unimi.dsi.fastutil.ints.IntPriorityQueue; +import it.unimi.dsi.fastutil.ints.IntSet; +import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; +import it.unimi.dsi.fastutil.objects.ObjectSet; +import it.unimi.dsi.logging.ProgressLogger; +import it.unimi.dsi.webgraph.ImmutableGraph; +import it.unimi.dsi.webgraph.LazyIntIterator; + +import java.util.Arrays; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +public class HittingDistanceMinimizer { + public static final Logger LOGGER = LoggerFactory.getLogger(HittingDistanceMinimizer.class); + + final ImmutableGraph transposed; + final int[] minMilestoneDistance; + final int[] closestMilestone; + final IntSet milestones; + final ObjectSet runningVisitors; + final IntPriorityQueue milestoneQueue; + final ProgressLogger pl; + + public HittingDistanceMinimizer(ImmutableGraph transposedGraph, IntSet milestones) { + this.transposed = transposedGraph; + this.milestones = milestones; + minMilestoneDistance = new int[transposedGraph.numNodes()]; + Arrays.fill(minMilestoneDistance, Integer.MAX_VALUE); + closestMilestone = new int[transposedGraph.numNodes()]; + Arrays.fill(closestMilestone, -1); + milestoneQueue = new IntArrayPriorityQueue(milestones.toIntArray()); + runningVisitors = new ObjectOpenHashSet(); + pl = new ProgressLogger(LOGGER, "milestones"); + pl.expectedUpdates = milestones.size(); + + } + + private class Visitor extends Thread { + + final int start; + final int[] dists; + final ImmutableGraph graph; + + Visitor(final ImmutableGraph graph, int startingNode) { + this.start = startingNode; + dists = new int[ graph.numNodes() ]; + this.graph = graph.copy(); + } + + @Override + public void run() { + final 
IntArrayFIFOQueue queue = new IntArrayFIFOQueue(); + + Arrays.fill( dists, Integer.MAX_VALUE ); // Initially, all distances are infinity. + + int curr, succ; + queue.enqueue( start ); + dists[ start ] = 0; + + LazyIntIterator successors; + + while( ! queue.isEmpty() ) { + curr = queue.dequeueInt(); + successors = graph.successors( curr ); + int d = graph.outdegree( curr ); + while( d-- != 0 ) { + succ = successors.nextInt(); + if ( dists[ succ ] == Integer.MAX_VALUE ) { + dists[ succ ] = dists[ curr ] + 1; + queue.enqueue( succ ); + } + } + } + + startNewThreadAfter(this); + } + + @Override + public int hashCode() { return start; } + + @Override + public boolean equals(Object o) { + return (((o instanceof Visitor)) && ((Visitor) o).start == this.start); + } + } + + private synchronized void startNewThreadAfter(Visitor thread) { + if (thread != null) { + if (!runningVisitors.remove(thread)) { + throw new IllegalStateException( + "Thread " + thread + " signaled completion but was not present."); + } + updateClosestMilestonesAfter(thread.start, thread.dists); + pl.update(); + } + + if (!milestoneQueue.isEmpty()) { + int milestone = milestoneQueue.dequeueInt(); + Visitor visitor = new Visitor(transposed, milestone); + runningVisitors.add(visitor); + visitor.start(); + } else + if (runningVisitors.isEmpty()) { + synchronized (this) { + this.notifyAll(); + } + } + } + + + private void updateClosestMilestonesAfter(int milestone, int[] distances) { + final int numNodes = transposed.numNodes(); + for (int node = 0; node < numNodes; node++) { + if (distances[node] < minMilestoneDistance[node]) { + minMilestoneDistance[node] = distances[node]; + closestMilestone[node] = milestone; + } + } + } + + public int[] compute() { + return compute(Runtime.getRuntime().availableProcessors()); + } + + public int[] compute(int nOfThreads) { + pl.start("Starting a BFS for each milestone (with " + nOfThreads + " parallel threads)..."); + for (int i = 0; i < nOfThreads; i++) { + 
startNewThreadAfter(null); + } + try { + synchronized (this) { + while (!milestoneQueue.isEmpty()) + this.wait(); + } + } catch (InterruptedException e) { throw new RuntimeException(e); } + + pl.done(); + + return closestMilestone; + + } + + +} diff --git a/java/src/it/unimi/di/wikipedia/parsing/DocumentSequenceImmutableGraph.java b/java/src/it/unimi/di/wikipedia/parsing/DocumentSequenceImmutableGraph.java new file mode 100644 index 0000000..917091b --- /dev/null +++ b/java/src/it/unimi/di/wikipedia/parsing/DocumentSequenceImmutableGraph.java @@ -0,0 +1,137 @@ +package it.unimi.di.wikipedia.parsing; + +import it.unimi.di.big.mg4j.document.Document; +import it.unimi.di.big.mg4j.document.DocumentIterator; +import it.unimi.di.big.mg4j.document.DocumentSequence; +import it.unimi.di.big.mg4j.tool.Scan; +import it.unimi.di.big.mg4j.tool.Scan.VirtualDocumentFragment; +import it.unimi.di.big.mg4j.tool.VirtualDocumentResolver; +import it.unimi.dsi.fastutil.longs.LongAVLTreeSet; +import it.unimi.dsi.fastutil.longs.LongSortedSet; +import it.unimi.dsi.fastutil.io.BinIO; +import it.unimi.dsi.fastutil.objects.ObjectIterator; +import it.unimi.dsi.fastutil.objects.ObjectList; +import it.unimi.dsi.big.webgraph.ImmutableGraph; +import it.unimi.dsi.big.webgraph.ImmutableSequentialGraph; +import it.unimi.dsi.big.webgraph.NodeIterator; + +import java.io.IOException; +import java.util.NoSuchElementException; + +/** Exposes a document sequence as a (sequentially accessible) immutable graph, according to some +* virtual field provided by the documents in the sequence. A suitable {@link VirtualDocumentResolver} +* is used to associate node numbers to each fragment. +* +*

More precisely, the graph will have as many nodes as there are documents in the sequence, the +* k-th document (starting from 0) representing node number k. +* The successors of a document are obtained by extracting the virtual field from the +* document, turning each {@linkplain it.unimi.di.mg4j.tool.Scan.VirtualDocumentFragment document specifier} +* into a document number (using the given {@linkplain VirtualDocumentResolver resolver}, +* and discarding unresolved URLs). +*/ +public class DocumentSequenceImmutableGraph extends ImmutableSequentialGraph { + + /** The underlying sequence. */ + private DocumentSequence sequence; + /** The number of the virtual field to be used. */ + private int virtualField; + /** The resolver to be used. */ + private VirtualDocumentResolver resolver; + + /** Creates an immutable graph from a sequence. + * + * @param sequence the sequence whence the immutable graph should be created. + * @param virtualField the number of the virtual field to be used to get the successors from. + * @param resolver the resolver to be used to map document specs to node numbers. + */ + public DocumentSequenceImmutableGraph( final DocumentSequence sequence, final int virtualField, final VirtualDocumentResolver resolver ) { + this.sequence = sequence; + this.virtualField = virtualField; + this.resolver = resolver; + } + + /** Creates a new immutable graph with the specified arguments. + * + * @param arg a 3-element array: the first is the basename of a {@link DocumentSequence}, the second is an integer specifying the virtual + * field number, the third is the basename of a {@link VirtualDocumentResolver}. + */ + public DocumentSequenceImmutableGraph( final String... 
arg ) throws IOException, ClassNotFoundException { + this( (DocumentSequence)BinIO.loadObject( arg[ 0 ] ), Integer.parseInt( arg[ 1 ] ), (VirtualDocumentResolver)BinIO.loadObject( arg[ 2 ] ) ); + } + + @Override + public ImmutableGraph copy() { + throw new UnsupportedOperationException(); + } + + @Override + public long numNodes() { + if ( resolver.numberOfDocuments() > Integer.MAX_VALUE ) throw new IllegalArgumentException(); + return resolver.numberOfDocuments(); + } + + @Override + public boolean randomAccess() { + return false; + } + + public NodeIterator nodeIterator() { + try { + final DocumentIterator documentIterator = sequence.iterator(); + return new NodeIterator() { + Document cachedDocument = documentIterator.nextDocument(); + int cachedDocumentNumber = 0; + long[] cachedSuccessors; + LongSortedSet succ = new LongAVLTreeSet(); + + public boolean hasNext() { + return cachedDocument != null; + } + + @SuppressWarnings("unchecked") + public long nextLong() { + if ( !hasNext() ) throw new NoSuchElementException(); + ObjectList vdf; + try { + vdf = (ObjectList)cachedDocument.content( virtualField ); + } + catch ( IOException exc1 ) { + throw new RuntimeException( exc1 ); + } + succ.clear(); + resolver.context( cachedDocument ); + ObjectIterator it = vdf.iterator(); + while ( it.hasNext() ) { + long successor = resolver.resolve( it.next().documentSpecifier() ); + if ( successor >= 0 ) succ.add( successor ); + } + cachedSuccessors = succ.toLongArray(); + // Get ready for the next request + try { + cachedDocument.close(); + cachedDocument = documentIterator.nextDocument(); + } + catch ( IOException e ) { + throw new RuntimeException( e ); + } + return cachedDocumentNumber++; + } + + public long outdegree() { + return cachedSuccessors.length; + } + + + public long[][] successorBigArray() { + return new long[][] {cachedSuccessors}; + } + + }; + } + catch ( IOException e ) { + throw new RuntimeException( e ); + } + + } + +} diff --git 
a/java/src/it/unimi/di/wikipedia/parsing/NamespacedWikipediaDocumentSequence.java b/java/src/it/unimi/di/wikipedia/parsing/NamespacedWikipediaDocumentSequence.java new file mode 100644 index 0000000..90f5a1e --- /dev/null +++ b/java/src/it/unimi/di/wikipedia/parsing/NamespacedWikipediaDocumentSequence.java @@ -0,0 +1,860 @@ +package it.unimi.di.wikipedia.parsing; + + +/* + * Modified version of: + * + * MG4J: Managing Gigabytes for Java (big) + * + * Copyright (C) 2013 Sebastiano Vigna + * + * This library is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published by the Free + * Software Foundation; either version 3 of the License, or (at your option) + * any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License + * for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, see . 
+ * + */ + +import info.bliki.wiki.filter.Encoder; +import info.bliki.wiki.filter.HTMLConverter; +import info.bliki.wiki.filter.PlainTextConverter; +import info.bliki.wiki.model.WikiModel; +import it.unimi.dsi.big.util.ShiftAddXorSignedStringMap; +import it.unimi.dsi.big.util.StringMap; +import it.unimi.di.big.mg4j.document.AbstractDocument; +import it.unimi.di.big.mg4j.document.AbstractDocumentFactory; +import it.unimi.di.big.mg4j.document.AbstractDocumentIterator; +import it.unimi.di.big.mg4j.document.AbstractDocumentSequence; +import it.unimi.di.big.mg4j.document.CompositeDocumentFactory; +import it.unimi.di.big.mg4j.document.Document; +import it.unimi.di.big.mg4j.document.DocumentFactory; +import it.unimi.di.big.mg4j.document.DocumentIterator; +import it.unimi.di.big.mg4j.document.DocumentSequence; +import it.unimi.di.big.mg4j.document.HtmlDocumentFactory; +import it.unimi.di.big.mg4j.document.PropertyBasedDocumentFactory; +import it.unimi.di.big.mg4j.document.WikipediaDocumentCollection; +import it.unimi.di.big.mg4j.tool.URLMPHVirtualDocumentResolver; +import it.unimi.di.big.mg4j.tool.VirtualDocumentResolver; +import it.unimi.di.big.mg4j.util.parser.callback.AnchorExtractor; +import it.unimi.di.big.mg4j.util.parser.callback.AnchorExtractor.Anchor; +import it.unimi.dsi.bits.TransformationStrategies; +import it.unimi.dsi.bits.TransformationStrategy; +import it.unimi.dsi.fastutil.io.BinIO; +import it.unimi.dsi.fastutil.io.FastBufferedInputStream; +import it.unimi.dsi.fastutil.objects.AbstractObject2LongFunction; +import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; +import it.unimi.dsi.fastutil.objects.Object2LongFunction; +import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap; +import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap; +import it.unimi.dsi.fastutil.objects.ObjectArrayList; +import it.unimi.dsi.fastutil.objects.ObjectBigList; +import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; +import 
it.unimi.dsi.fastutil.objects.Reference2ObjectMap; +import it.unimi.dsi.fastutil.objects.Reference2ObjectOpenHashMap; +import it.unimi.dsi.io.FastBufferedReader; +import it.unimi.dsi.io.FileLinesCollection; +import it.unimi.dsi.io.WordReader; +import it.unimi.dsi.lang.MutableString; +import it.unimi.dsi.lang.ObjectParser; +import it.unimi.dsi.logging.ProgressLogger; +import it.unimi.dsi.sux4j.mph.MWHCFunction; +import it.unimi.dsi.util.TextPattern; +import it.unimi.dsi.webgraph.ImmutableGraph; + +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Serializable; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.Map; +import java.util.Map.Entry; +import java.util.concurrent.ArrayBlockingQueue; + +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; +import org.apache.commons.io.IOUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.Attributes; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import com.google.common.base.Charsets; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Iterables; +import com.google.common.html.HtmlEscapers; +import com.martiansoftware.jsap.FlaggedOption; +import com.martiansoftware.jsap.JSAP; +import com.martiansoftware.jsap.JSAPException; +import com.martiansoftware.jsap.JSAPResult; +import com.martiansoftware.jsap.Parameter; +import com.martiansoftware.jsap.SimpleJSAP; +import com.martiansoftware.jsap.Switch; +import com.martiansoftware.jsap.UnflaggedOption; + +/** A class exhibiting a standard Wikipedia XML dump as a {@link DocumentSequence}. + * + *

Warning: this class has no connection whatsoever with + * {@link WikipediaDocumentCollection}. + * + *

The purpose of this class is making the indexing of Wikipedia and of its entity + * graph starting from a pristine Wikipedia XML dump reasonably easy. There are a few + * steps involved, mainly due to the necessity of working out redirects, but the whole + * procedure can be carried out with very little resources. The class uses the + * {@link WikiModel#toHtml(String, Appendable, String, String)} method to convert + * the Wikipedia format into HTML, and then passes the result to a standard {@link HtmlDocumentFactory} + * (suggestion on alternative conversion methods are welcome). + * A few additional fields are handled by {@link WikipediaHeaderFactory}. + * + *

Note that no properties are passed to the underlying {@link HtmlDocumentFactory}: if you want + * to set the anchor properties (see {@link HtmlDocumentFactory.MetadataKeys}), you need to use + * {@linkplain #NamespacedWikipediaDocumentSequence(String, boolean, String, boolean, boolean, int, int, int) a quite humongous constructor}. + * + *

How to index Wikipedia

+ * + *

As a first step, download the Wikipedia XML dump (it's the “pages-articles” file; + * it should start with a mediawiki opening tag). This class can process the + * file in its compressed form, but we suggest to uncompress it using bunzip2, + * as processing is an order of magnitude faster. (Note that the following process will exclude namespaced + * pages such as Template:something; if you want to include them, you must + * use a different {@linkplain #NamespacedWikipediaDocumentSequence(String, boolean, String, boolean, boolean) constructor}.) + * + * + *

The first step is extracting metadata (in particular, the URLs that are necessary to + * index correctly the anchor text). We do not suggest specific Java options, but try to use + * as much memory as you can. + *

+ * java it.unimi.di.mg4j.tool.ScanMetadata \
+ *   -o "it.unimi.di.mg4j.document.WikipediaDocumentSequence(enwiki-latest-pages-articles.xml,false,http://en.wikipedia.org/wiki/,false)" \
+ *   -u enwiki.uris -t enwiki.titles
+ * 
+ * + *

Note that we used the {@link ObjectParser}-based constructor of this class, which makes it possible to create + * a {@link NamespacedWikipediaDocumentSequence} instance parsing a textual specification (see the + * {@linkplain #NamespacedWikipediaDocumentSequence(String, boolean, String, boolean) constructor} + * documentation for details about the parameters). + * + *

The second step consists in building a first {@link VirtualDocumentResolver} which, however, + * does not comprise redirect information: + *

+ * java it.unimi.di.mg4j.tool.URLMPHVirtualDocumentResolver -o enwiki.uris enwiki.vdr
+ * 
+ * + *

Now we need to use the ad hoc main method of this class to rescan the collection, gather the redirect + * information and merge it with our current resolver: + *

+ * java it.unimi.di.mg4j.document.WikipediaDocumentSequence \
+ *   enwiki-latest-pages-articles.xml http://en.wikipedia.org/wiki/ enwiki.uris enwiki.vdr enwikired.vdr
+ * 
+ * + *

During this phase a quite large number of warnings about failed redirects might appear. This is normal, + * in particular if you do not index template pages. If you suspect an actual bug, try first to index template pages, + * too. Failed redirects should be in the order of few thousands, and all due to internal inconsistencies of + * the dump: to check that this is the case, check whether the target of a failed redirect appears as a page + * title (it shouldn't). + * + *

We have now all information required to build a complete index (we use the Porter2 stemmer in this example): + *

+ * java it.unimi.di.mg4j.tool.IndexBuilder \
+ *   -o "it.unimi.di.mg4j.document.WikipediaDocumentSequence(enwiki.xml,false,http://en.wikipedia.org/wiki/,true)" \ 
+ *   --all-fields -v enwiki.vdr -t EnglishStemmer enwiki
+ * 
+ * + *

Finally, we can build the entity graph using a bridge class that exposes any {@link DocumentSequence} with a virtual + * field as an {@link ImmutableGraph} of the WebGraph framework (the nodes will be in one-to-one correspondence with the documents + * returned by the index): + *

+ * java it.unimi.dsi.big.webgraph.BVGraph \
+ *   -s "it.unimi.di.mg4j.util.DocumentSequenceImmutableSequentialGraph(\"it.unimi.di.mg4j.document.WikipediaDocumentSequence(enwiki.xml,false,http://en.wikipedia.org/wiki/,true)\",anchor,enwikired.vdr)" \ 
+ *   enwiki
+ * 
+ * + *

Additional fields

+ * + *

The additional fields generated by this class (some of which are a bit hacky) are: + * + *

+ *
title + *
the title of the Wikipedia page; + *
id + *
a payload index containing the Wikipedia identifier of the page; + *
lastedit + *
a payload index containing the last edit of the page; + *
category + *
a field containing the categories of the page, separated by an artificial marker OXOXO (so when you look for a category as a phrase you + * don't get false cross-category positives); + *
firstpar + *
a heuristically generated first paragraph of the page, useful for identification beyond the title; + *
redirects + *
a virtual field treating the link of the page with its title and any redirect link to the page as an anchor: in practice, the + * field contains all names under which the page is known in Wikipedia. + *
+ * + *

Note that for each link in a disambiguation page this class will generate a fake link with the same target, but + * the title of the disambiguation page as text. This is in the same spirit of the redirects field—we enrich + * the HTML anchor field with useful information without altering the generated graph. + */ + +public class NamespacedWikipediaDocumentSequence extends AbstractDocumentSequence implements Serializable { + private static final Logger LOGGER = LoggerFactory.getLogger( NamespacedWikipediaDocumentSequence.class ); + private static final long serialVersionUID = 1L; + + private static final TextPattern CATEGORY_START = new TextPattern( "[[Category:" ); + private static final TextPattern BRACKETS_CLOSED = new TextPattern( "]]" ); + private static final TextPattern BRACES_CLOSED = new TextPattern( "}}" ); + private static final TextPattern DISAMBIGUATION = new TextPattern( "{{disambiguation" ); + private static final TextPattern BRACKETS_OPEN = new TextPattern( "[[" ); + private static final char[] END_OF_DISAMBIGUATION_LINK = new char[] { '|', ']' }; + + /** A marker used to denote end of input. */ + private static final DocumentAndFactory END = new DocumentAndFactory( null, null ); + /** The prototype {@link CompositeDocumentFactory} used to parse Wikipedia pages. */ + private final DocumentFactory factory; + /** Whether the input is compressed with bzip2. */ + private final boolean bzipped; + /** Whether to parse text (e.g., we do not parse text when computing titles/URIs). */ + private final boolean parseText; + /** Whether to keep in the index namespace pages. */ + private final boolean keepNamespaced; + /** The Wikipedia XML dump. */ + private final String wikipediaXmlDump; + /** The base URL for pages (e.g., http://en.wikipedi.org/wiki/). */ + private final String baseURL; + /** {@link #baseURL} concatenated with ${title}. */ + private final String linkBaseURL; + /** {@link #baseURL} concatenated with ${image}. 
*/ + private final String imageBaseURL; + /** The set of namespaces specified in {@link #wikipediaXmlDump}. */ + private ImmutableSet nameSpaces; + /** This list (whose access must be synchronized) accumulates virtual text (anchors) generated by redirects. + * It is filled when meeting redirect pages, and it is emptied at the first non-redirect page (the page in which the list + * is emptied is immaterial). Note that because of this setup, if there are some redirect + * pages that are not followed by any indexed page the anchors of those redirects won't be processed at all. + * If this is a problem, just add a fake empty page at the end. */ + private final ObjectArrayList redirectAnchors = new ObjectArrayList(); + + public static enum MetadataKeys { + ID, + LASTEDIT, + CATEGORY, + FIRSTPAR, + /** This key is used internally by {@link WikipediaHeaderFactory} and is associated with the list of redirect anchors. */ + REDIRECT + }; + + /** A factory responsible for special Wikipedia fields (see the {@linkplain NamespacedWikipediaDocumentSequence class documentation}). It + * will be {@linkplain CompositeDocumentFactory composed} with an {@link HtmlDocumentFactory}. 
*/ + public static final class WikipediaHeaderFactory extends AbstractDocumentFactory { + private static final long serialVersionUID = 1L; + private static final Object2IntOpenHashMap FIELD_2_INDEX = new Object2IntOpenHashMap( new String[] { "title", "id", "lastedit", "category", "firstpar", "redirect" }, new int[] { 0, 1, 2, 3, 4, 5 } ); + static { + FIELD_2_INDEX.defaultReturnValue( -1 ); + } + + private final WordReader wordReader = new FastBufferedReader(); + + @Override + public int numberOfFields() { + return 6; + } + + @Override + public String fieldName( int field ) { + switch( field ) { + case 0: return "title"; + case 1: return "id"; + case 2: return "lastedit"; + case 3: return "category"; + case 4: return "firstpar"; + case 5: return "redirect"; + default: throw new IllegalArgumentException(); + } + } + + @Override + public int fieldIndex( String fieldName ) { + return FIELD_2_INDEX.getInt( fieldName ); + } + + @Override + public FieldType fieldType( int field ) { + switch( field ) { + case 0: return FieldType.TEXT; + case 1: return FieldType.INT; + case 2: return FieldType.DATE; + case 3: return FieldType.TEXT; + case 4: return FieldType.TEXT; + case 5: return FieldType.VIRTUAL; + default: throw new IllegalArgumentException(); + } + } + + @Override + public Document getDocument( final InputStream unusedRawContent, final Reference2ObjectMap, Object> metadata ) throws IOException { + return new AbstractDocument() { + + @Override + public WordReader wordReader( int field ) { + return wordReader; // Fixed, for the time being. 
+ } + + @Override + public CharSequence uri() { + return (CharSequence)metadata.get( PropertyBasedDocumentFactory.MetadataKeys.URI ); + } + + @Override + public CharSequence title() { + return (CharSequence)metadata.get( PropertyBasedDocumentFactory.MetadataKeys.TITLE ); + } + + @Override + public Object content( final int field ) throws IOException { + switch( field ) { + case 0: return new FastBufferedReader( (MutableString)metadata.get( PropertyBasedDocumentFactory.MetadataKeys.TITLE ) ); + case 1: return metadata.get( MetadataKeys.ID ); + case 2: return metadata.get( MetadataKeys.LASTEDIT ); + case 3: return new FastBufferedReader( (MutableString)metadata.get( MetadataKeys.CATEGORY ) ); + case 4: return new FastBufferedReader( (MutableString)metadata.get( MetadataKeys.FIRSTPAR ) ); + case 5: + @SuppressWarnings("unchecked") + final ObjectArrayList redirectAnchors = (ObjectArrayList)metadata.get( MetadataKeys.REDIRECT ); + ImmutableList result; + + synchronized( redirectAnchors ) { + redirectAnchors.add( new Anchor( (MutableString)metadata.get( PropertyBasedDocumentFactory.MetadataKeys.URI ), (MutableString)metadata.get( PropertyBasedDocumentFactory.MetadataKeys.TITLE ) ) ); + result = ImmutableList.copyOf( redirectAnchors ); + redirectAnchors.clear(); + } + // System.err.println( "Adding " + result ); + return result; + default: throw new IllegalArgumentException(); + } + } + }; + } + + @Override + public DocumentFactory copy() { + return new WikipediaHeaderFactory(); + } + + } + + /** Builds a new Wikipedia document sequence that discards namespaced pages. + * + * @param file the file containing the Wikipedia dump. + * @param bzipped whether {@code file} is compressed with bzip2. + * @param baseURL a base URL for links (e.g., for the English Wikipedia, http://en.wikipedia.org/wiki/); + * note that if it is nonempty this string must terminate with a slash. 
+ * @param parseText whether to parse the text (this parameter is only set to false during metadata-scanning + * phases to speed up the scanning process). + */ + public NamespacedWikipediaDocumentSequence( final String file, final boolean bzipped, final String baseURL, final boolean parseText) { + this( file, bzipped, baseURL, parseText, false ); + } + + /** Builds a new Wikipedia document sequence using default anchor settings. + * + * @param file the file containing the Wikipedia dump. + * @param bzipped whether {@code file} is compressed with bzip2. + * @param baseURL a base URL for links (e.g., for the English Wikipedia, http://en.wikipedia.org/wiki/); + * note that if it is nonempty this string must terminate with a slash. + * @param parseText whether to parse the text (this parameter is only set to false during metadata-scanning + * phases to speed up the scanning process). + * @param keepNamespaced whether to keep namespaced pages (e.g., Template:something pages). + */ + public NamespacedWikipediaDocumentSequence( final String file, final boolean bzipped, final String baseURL, final boolean parseText, final boolean keepNamespaced) { + this( file, bzipped, baseURL, parseText, keepNamespaced, 8, 8, 8); + } + + /** Builds a new Wikipedia document sequence. + * + * @param file the file containing the Wikipedia dump. + * @param bzipped whether {@code file} is compressed with bzip2. + * @param baseURL a base URL for links (e.g., for the English Wikipedia, http://en.wikipedia.org/wiki/); + * note that if it is nonempty this string must terminate with a slash. + * @param parseText whether to parse the text (this parameter is only set to false during metadata-scanning + * phases to speed up the scanning process). + * @param keepNamespaced whether to keep namespaced pages (e.g., Template:something pages). + * @param maxPreAnchor maximum number of character before an anchor. + * @param maxAnchor maximum number of character in an anchor. 
+ * @param maxPostAnchor maximum number of characters after an anchor. + */ + public NamespacedWikipediaDocumentSequence( final String file, final boolean bzipped, final String baseURL, final boolean parseText, final boolean keepNamespaced, final int maxPreAnchor, final int maxAnchor, final int maxPostAnchor) { + this.wikipediaXmlDump = file; + this.bzipped = bzipped; + this.baseURL = baseURL; + this.parseText = parseText; + this.keepNamespaced = keepNamespaced; + Reference2ObjectOpenHashMap, Object> metadata = new Reference2ObjectOpenHashMap, Object>( + new Enum[] { HtmlDocumentFactory.MetadataKeys.MAXPREANCHOR, HtmlDocumentFactory.MetadataKeys.MAXANCHOR, HtmlDocumentFactory.MetadataKeys.MAXPOSTANCHOR }, + new Integer[] { Integer.valueOf( maxPreAnchor ), Integer.valueOf( maxAnchor ), Integer.valueOf( maxPostAnchor ) } + ); + DocumentFactory htmlDocumentFactory = new HtmlDocumentFactory(metadata); + this.factory = CompositeDocumentFactory.getFactory( new DocumentFactory[] { new WikipediaHeaderFactory(), htmlDocumentFactory }, new String[] { "title", "id", "lastedit", "category", "firstpar", "redirect", "text", "dummy", "anchor" } ); + linkBaseURL = baseURL + "${title}"; + imageBaseURL = baseURL + "${image}"; + } + + /** A string-based constructor to be used with an {@link ObjectParser}. + * + * @see #NamespacedWikipediaDocumentSequence(String, boolean, String, boolean) + */ + public NamespacedWikipediaDocumentSequence( final String file, final String bzipped, final String baseURL, final String parseText ) { + this( file, Boolean.parseBoolean( bzipped ), baseURL, Boolean.parseBoolean( parseText ) ); + } + + /** A string-based constructor to be used with an {@link ObjectParser}. 
+ * + * @see #NamespacedWikipediaDocumentSequence(String, boolean, String, boolean, boolean) + */ + public NamespacedWikipediaDocumentSequence( final String file, final String bzipped, final String baseURL, final String parseText, final String keepNamespaced ) { + this( file, Boolean.parseBoolean( bzipped ), baseURL, Boolean.parseBoolean( parseText ), Boolean.parseBoolean( keepNamespaced ) ); + } + + /** A string-based constructor to be used with an {@link ObjectParser}. + * + * @see #NamespacedWikipediaDocumentSequence(String, boolean, String, boolean, boolean, int, int, int) + */ + public NamespacedWikipediaDocumentSequence( final String file, final String bzipped, final String baseURL, final String parseText, final String keepNamespaced, final String maxBeforeAnchor, final String maxAnchor, final String maxPostAnchor ) { + this( file, Boolean.parseBoolean( bzipped ), baseURL, Boolean.parseBoolean( parseText ), Boolean.parseBoolean( keepNamespaced ), Integer.parseInt( maxBeforeAnchor ), Integer.parseInt( maxAnchor ), Integer.parseInt( maxPostAnchor ) ); + } + + private static final class DocumentAndFactory { + public final Document document; + public final DocumentFactory factory; + + public DocumentAndFactory( final Document document, final DocumentFactory documentFactory ) { + this.document = document; + this.factory = documentFactory; + } + } + + public boolean isATrueNamespace(final String stringBeforeColumn) { + return nameSpaces.contains( stringBeforeColumn.toLowerCase() ); + } + + public boolean isATrueNamespace(final MutableString stringBeforeColumn) { + return nameSpaces.contains( stringBeforeColumn.toLowerCase() ); + } + + @Override + public DocumentIterator iterator() throws IOException { + final SAXParserFactory saxParserFactory = SAXParserFactory.newInstance(); + saxParserFactory.setNamespaceAware( true ); + final MutableString nameSpaceAccumulator = new MutableString(); + final ObjectOpenHashSet nameSpacesAccumulator = new ObjectOpenHashSet(); + 
final ArrayBlockingQueue freeFactories = new ArrayBlockingQueue( 16 ); + for( int i = freeFactories.remainingCapacity(); i-- != 0; ) freeFactories.add( this.factory.copy() ); + final ArrayBlockingQueue readyDocumentsAndFactories = new ArrayBlockingQueue( freeFactories.size() ); + + final SAXParser parser; + try { + parser = saxParserFactory.newSAXParser(); + } + catch ( Exception e ) { + throw new RuntimeException( e.getMessage(), e ); + } + final DefaultHandler handler = new DefaultHandler() { + private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'"); + private boolean inText; + private boolean inTitle; + private boolean inId; + private boolean inTimestamp; + private boolean inNamespaceDef; + private boolean redirect; + private MutableString text = new MutableString(); + private MutableString title = new MutableString(); + private MutableString id = new MutableString(); + private MutableString timestamp = new MutableString(); + private final Reference2ObjectMap, Object> metadata = new Reference2ObjectOpenHashMap, Object>(); + { + metadata.put( PropertyBasedDocumentFactory.MetadataKeys.ENCODING, "UTF-8" ); + metadata.put( MetadataKeys.REDIRECT, redirectAnchors ); + } + + @Override + public void startElement( String uri, String localName, String qName, Attributes attributes ) throws SAXException { + if ( "page".equals( localName ) ) { + redirect = inText = inTitle = inId = inTimestamp = false; + text.length( 0 ); + title.length( 0 ); + id.length( 0 ); + timestamp.length( 0 ); + } + else if ( "text".equals( localName ) ) inText = true; + else if ( "title".equals( localName ) && title.length() == 0 ) inTitle = true; // We catch only the first id/title elements. 
+ else if ( "id".equals( localName ) && id.length() ==0 ) inId = true; + else if ( "timestamp".equals( localName ) && timestamp.length() ==0 ) inTimestamp = true; + else if ( "redirect".equals( localName ) ) { + redirect = true; + if ( attributes.getValue( "title" ) != null ) + // Accumulate the title of the page as virtual text of the redirect page. + synchronized ( redirectAnchors ) { + final String link = Encoder.encodeTitleToUrl( attributes.getValue( "title" ), true ); + redirectAnchors.add( new AnchorExtractor.Anchor( new MutableString( baseURL.length() + link.length() ).append( baseURL ).append( link ), title.copy() ) ); + } + } + else if ( "namespace".equals( localName ) ) { + // Found a new namespace + inNamespaceDef = true; + nameSpaceAccumulator.length( 0 ); + } + } + + @Override + public void endElement( String uri, String localName, String qName ) throws SAXException { + if ( "namespace".equals( localName ) ) { // Collecting a namespace + if ( nameSpaceAccumulator.length() != 0 ) nameSpacesAccumulator.add( nameSpaceAccumulator.copy().toLowerCase() ); + return; + } + + if ( "namespaces".equals( localName ) ) { // All namespaces collected + nameSpaces = ImmutableSet.copyOf( nameSpacesAccumulator ); + return; + } + + if ( ! 
redirect ) { + if ( "title".equals( localName ) ) { + // Set basic metadata for the page + metadata.put( PropertyBasedDocumentFactory.MetadataKeys.TITLE, title.copy() ); + String link = Encoder.encodeTitleToUrl( title.toString(), true ); + metadata.put( PropertyBasedDocumentFactory.MetadataKeys.URI, new MutableString( baseURL.length() + link.length() ).append( baseURL ).append( link ) ); + inTitle = false; + } + else if ( "id".equals( localName ) ) { + metadata.put( MetadataKeys.ID, Long.valueOf( id.toString() ) ); + inId = false; + } + else if ( "timestamp".equals( localName ) ) { + try { + metadata.put( MetadataKeys.LASTEDIT, dateFormat.parse( timestamp.toString() ) ); + } + catch ( ParseException e ) { + throw new RuntimeException( e.getMessage(), e ); + } + inTimestamp = false; + } + else if ( "text".equals( localName ) ) { + inText = false; + if ( ! keepNamespaced ) { + // Namespaces are case-insensitive and language-dependent + final int pos = title.indexOf( ':' ); + if ( pos != -1 && isATrueNamespace(title.substring( 0, pos )) ) return; + } + try { + final MutableString html = new MutableString(); + DocumentFactory freeFactory; + try { + freeFactory = freeFactories.take(); + } + catch ( InterruptedException e ) { + throw new RuntimeException( e.getMessage(), e ); + } + if ( parseText ) { + if ( DISAMBIGUATION.search( text ) != -1 ) { // It's a disambiguation page. + /* Roi's hack: duplicate links using the page title, so the generic name will end up as anchor text. */ + final MutableString newLinks = new MutableString(); + for( int start = 0, end; ( start = BRACKETS_OPEN.search( text, start ) ) != -1; start = end ) { + end = start; + final int endOfLink = text.indexOfAnyOf( END_OF_DISAMBIGUATION_LINK, start ); + // Note that we don't escape title because we are working at the Wikipedia raw text level. 
+ if ( endOfLink != -1 ) { + newLinks.append( text.array(), start, endOfLink - start ).append( '|' ).append( title ).append( "]]\n" ); + end = endOfLink; + } + end++; + } + + text.append( newLinks ); + } + // We separate categories by OXOXO, so we don't get overflowing phrases. + final MutableString category = new MutableString(); + for( int start = 0, end; ( start = CATEGORY_START.search( text, start ) ) != -1; start = end ) { + end = BRACKETS_CLOSED.search( text, start += CATEGORY_START.length() ); + if ( end != -1 ) category.append( text.subSequence( start, end ) ).append( " OXOXO " ); + else break; + } + metadata.put( MetadataKeys.CATEGORY, category ); + + // Heuristics to get the first paragraph + metadata.put( MetadataKeys.FIRSTPAR, new MutableString() ); + String plainText = new WikiModel( imageBaseURL, linkBaseURL ).render( new PlainTextConverter( true ), text.toString() ); + for( int start = 0; start < plainText.length(); start++ ) { + //System.err.println("Examining " + plainText.charAt( start ) ); + if ( Character.isWhitespace( plainText.charAt( start ) ) ) continue; + if ( plainText.charAt( start ) == '{' ) { + //System.err.print( "Braces " + start + " text: \"" + plainText.subSequence( start, start + 10 ) + "\" -> " ); + start = BRACES_CLOSED.search( plainText, start ); + //System.err.println( start + " text: \"" + plainText.subSequence( start, start + 10 ) + "\"" ); + if ( start == -1 ) break; + start++; + } + else if ( plainText.charAt( start ) == '[' ) { + start = BRACKETS_CLOSED.search( plainText, start ); + if ( start == -1 ) break; + start++; + } + else { + final int end = plainText.indexOf( '\n', start ); + if ( end != -1 ) metadata.put( MetadataKeys.FIRSTPAR, new MutableString( plainText.substring( start, end ) ) );//new MutableString( new WikiModel( imageBaseURL, linkBaseURL ).render( new PlainTextConverter( true ), text.substring( start, end ).toString() ) ) ); + break; + } + } + + try { + WikiModel wikiModel = new WikiModel( imageBaseURL, 
linkBaseURL ); + wikiModel.render( new HTMLConverter(), text.toString(), html, false, false ); + final Map categories = wikiModel.getCategories(); + // Put back category links in the page (they have been parsed by bliki and to not appear anymore in the HTML rendering) + for( Entry entry: categories.entrySet() ) { + final String key = entry.getKey(); + final String value = entry.getValue().trim(); + if ( value.length() != 0 ) // There are empty such things + html.append( "\n" ).append( HtmlEscapers.htmlEscaper().escape( key ) ).append( "\n" ); + } + } + catch( Exception e ) { + LOGGER.error( "Unexpected exception while parsing " + title, e ); + } + } + readyDocumentsAndFactories.put( new DocumentAndFactory( freeFactory.getDocument( IOUtils.toInputStream( html, Charsets.UTF_8 ), new Reference2ObjectOpenHashMap, Object>( metadata ) ), freeFactory ) ); + } + catch ( InterruptedException e ) { + throw new RuntimeException( e.getMessage(), e ); + } + catch ( IOException e ) { + throw new RuntimeException( e.getMessage(), e ); + } + } + } + } + + @Override + public void characters( char[] ch, int start, int length ) throws SAXException { + if ( inText && parseText ) text.append( ch, start, length ); + if ( inTitle ) title.append( ch, start, length ); + if ( inId ) id.append( ch, start, length ); + if ( inTimestamp ) timestamp.append( ch, start, length ); + if ( inNamespaceDef ) { + nameSpaceAccumulator.append( ch, start, length ); + inNamespaceDef = false; // Dirty, but it works + } + } + + @Override + public void ignorableWhitespace( char[] ch, int start, int length ) throws SAXException { + if ( inText && parseText ) text.append( ch, start, length ); + if ( inTitle ) title.append( ch, start, length ); + } + }; + + final Thread parsingThread = new Thread() { + public void run() { + try { + InputStream in = new FileInputStream( wikipediaXmlDump ); + if ( bzipped ) in = new BZip2CompressorInputStream( in ); + parser.parse( new InputSource( new InputStreamReader( new 
FastBufferedInputStream( in ), Charsets.UTF_8 ) ), handler ); + readyDocumentsAndFactories.put( END ); + } + catch ( Exception e ) { + throw new RuntimeException( e.getMessage(), e ); + } + } + }; + + parsingThread.start(); + + return new AbstractDocumentIterator() { + private DocumentFactory lastFactory; + @Override + public Document nextDocument() throws IOException { + try { + final DocumentAndFactory documentAndFactory = readyDocumentsAndFactories.take(); + if ( lastFactory != null ) freeFactories.put( lastFactory ); + if ( documentAndFactory == END ) return null; + lastFactory = documentAndFactory.factory; + return documentAndFactory.document; + } + catch ( InterruptedException e ) { + throw new RuntimeException( e.getMessage(), e ); + } + } + }; + } + + @Override + public DocumentFactory factory() { + return factory; + } + + /** A wrapper around a signed function that remaps entries exceeding a provided threshold using a specified target array. */ + public static final class SignedRedirectedStringMap extends AbstractObject2LongFunction implements StringMap { + private static final long serialVersionUID = 1L; + /** The number of documents. */ + private final long numberOfDocuments; + /** A signed function function mapping valid keys to their ordinal position. */ + private Object2LongFunction signedFunction; + /** The value to be returned for keys whose ordinal position is greater than {@link #numberOfDocuments}. */ + private final long[] target; + + /** Creates a new signed redirected map. + * + * @param numberOfDocuments the threshold after which the {@code target} array will be used to compute the output. + * @param signedFunction the base signed function. + * @param target an array providing the output for items beyond {@code numberOfDocuments}; it must be + * long as the size of {@code signedFunction} minus {@code numberOfDocuments}. 
+ */ + public SignedRedirectedStringMap( final long numberOfDocuments, final Object2LongFunction signedFunction, final long[] target ) { + this.numberOfDocuments = numberOfDocuments; + this.signedFunction = signedFunction; + this.target = target; + } + + @Override + public long getLong( Object key ) { + final long index = signedFunction.getLong( key ); + if ( index == -1 ) return -1; + if ( index < numberOfDocuments ) return index; + return target[ (int)( index - numberOfDocuments ) ]; + } + + @Override + public boolean containsKey( Object key ) { + return signedFunction.getLong( key ) != -1; + } + + public long size64() { + return numberOfDocuments; + } + + @Override + @Deprecated + public int size() { + return (int)Math.min( Integer.MAX_VALUE, size64() ); + } + + @Override + public ObjectBigList list() { + return null; + } + } + + + public static void main( final String arg[] ) throws ParserConfigurationException, SAXException, IOException, JSAPException, ClassNotFoundException { + SimpleJSAP jsap = new SimpleJSAP( NamespacedWikipediaDocumentSequence.class.getName(), "Computes the redirects of a Wikipedia dump and integrate them into an existing virtual document resolver for the dump.", + new Parameter[] { + new Switch( "bzip2", 'b', "bzip2", "The file is compressed with bzip2" ), + new Switch( "iso", 'i', "iso", "Use ISO-8859-1 coding internally (i.e., just use the lower eight bits of each character)." ), + new FlaggedOption( "width", JSAP.INTEGER_PARSER, Integer.toString( Long.SIZE ), JSAP.NOT_REQUIRED, 'w', "width", "The width, in bits, of the signatures used to sign the function from URIs to their rank." ), + new UnflaggedOption( "file", JSAP.STRING_PARSER, JSAP.REQUIRED, "The file containing the Wikipedia dump." ), + new UnflaggedOption( "baseURL", JSAP.STRING_PARSER, JSAP.REQUIRED, "The base URL for the collection (e.g., http://en.wikipedia.org/wiki/)." 
), + new UnflaggedOption( "uris", JSAP.STRING_PARSER, JSAP.REQUIRED, "The URIs of the documents in the collection (generated by ScanMetadata)." ), + new UnflaggedOption( "vdr", JSAP.STRING_PARSER, JSAP.REQUIRED, "The name of a precomputed virtual document resolver for the collection." ), + new UnflaggedOption( "redvdr", JSAP.STRING_PARSER, JSAP.REQUIRED, "The name of the resulting virtual document resolver." ) + }); + + JSAPResult jsapResult = jsap.parse( arg ); + if ( jsap.messagePrinted() ) return; + + final SAXParserFactory saxParserFactory = SAXParserFactory.newInstance(); + saxParserFactory.setNamespaceAware( true ); + final Object2ObjectOpenHashMap redirects = new Object2ObjectOpenHashMap(); + final String baseURL = jsapResult.getString( "baseURL" ); + final ProgressLogger progressLogger = new ProgressLogger( LOGGER ); + progressLogger.itemsName = "redirects"; + progressLogger.start( "Extracting redirects..." ); + + final SAXParser parser = saxParserFactory.newSAXParser(); + final DefaultHandler handler = new DefaultHandler() { + private boolean inTitle; + private MutableString title = new MutableString(); + + @Override + public void startElement( String uri, String localName, String qName, Attributes attributes ) throws SAXException { + if ( "page".equals( localName ) ) { + inTitle = false; + title.length( 0 ); + } + else if ( "title".equals( localName ) && title.length() == 0 ) inTitle = true; // We catch only the first title element. 
+ else if ( "redirect".equals( localName ) && attributes.getValue( "title" ) != null ) { + progressLogger.update(); + redirects.put( title.copy(), attributes.getValue( "title" ) ); + } + } + + @Override + public void endElement( String uri, String localName, String qName ) throws SAXException { + if ( "title".equals( localName ) ) inTitle = false; + } + + @Override + public void characters( char[] ch, int start, int length ) throws SAXException { + if ( inTitle ) title.append( ch, start, length ); + } + + @Override + public void ignorableWhitespace( char[] ch, int start, int length ) throws SAXException { + if ( inTitle ) title.append( ch, start, length ); + } + }; + + InputStream in = new FileInputStream( jsapResult.getString( "file" ) ); + if ( jsapResult.userSpecified( "bzip2" ) ) in = new BZip2CompressorInputStream( in ); + parser.parse( new InputSource( new InputStreamReader( new FastBufferedInputStream( in ), Charsets.UTF_8 ) ), handler ); + progressLogger.done(); + + final Object2LongLinkedOpenHashMap resolved = new Object2LongLinkedOpenHashMap(); + final VirtualDocumentResolver vdr = (VirtualDocumentResolver)BinIO.loadObject( jsapResult.getString( "vdr" ) ); + + progressLogger.expectedUpdates = redirects.size(); + progressLogger.start( "Examining redirects..." 
); + + for( Map.Entry e: redirects.entrySet() ) { + final MutableString start = new MutableString().append( baseURL ).append( Encoder.encodeTitleToUrl( e.getKey().toString(), true ) ); + final MutableString end = new MutableString().append( baseURL ).append( Encoder.encodeTitleToUrl( e.getValue(), true ) ); + final long s = vdr.resolve( start ); + if ( s == -1 ) { + final long t = vdr.resolve( end ); + if ( t != -1 ) resolved.put( start.copy(), t ); + else LOGGER.warn( "Failed redirect: " + start + " -> " + end ); + } + else LOGGER.warn( "URL " + start + " is already known to the virtual document resolver" ); + + progressLogger.lightUpdate(); + } + + progressLogger.done(); + + //System.err.println(resolved); + + final Iterable allURIs = Iterables.concat( new FileLinesCollection( jsapResult.getString( "uris" ), "UTF-8" ), resolved.keySet() ); + final long numberOfDocuments = vdr.numberOfDocuments(); + + final TransformationStrategy transformationStrategy = jsapResult.userSpecified( "iso" ) + ? 
TransformationStrategies.iso() + : TransformationStrategies.utf16(); + + BinIO.storeObject( + new URLMPHVirtualDocumentResolver( + new SignedRedirectedStringMap( numberOfDocuments, + new ShiftAddXorSignedStringMap( allURIs.iterator(), new MWHCFunction.Builder().keys( allURIs ).transform( transformationStrategy ).build(), jsapResult.getInt( "width" ) ), + resolved.values().toLongArray() ) ), jsapResult.getString( "redvdr" ) ); + } +} diff --git a/java/src/it/unimi/di/wikipedia/parsing/WikipediaCategoryProducer.java b/java/src/it/unimi/di/wikipedia/parsing/WikipediaCategoryProducer.java new file mode 100644 index 0000000..2a84cfa --- /dev/null +++ b/java/src/it/unimi/di/wikipedia/parsing/WikipediaCategoryProducer.java @@ -0,0 +1,232 @@ +package it.unimi.di.wikipedia.parsing; + +import it.unimi.di.big.mg4j.document.Document; +import it.unimi.di.big.mg4j.document.DocumentIterator; +import it.unimi.di.wikipedia.utils.IntMapGraph; +import it.unimi.di.wikipedia.utils.MapUtils; +import it.unimi.dsi.fastutil.ints.Int2ObjectMap; +import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap; +import it.unimi.dsi.fastutil.ints.IntOpenHashSet; +import it.unimi.dsi.fastutil.ints.IntSet; +import it.unimi.dsi.fastutil.io.BinIO; +import it.unimi.dsi.fastutil.objects.Object2IntMap; +import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; +import it.unimi.dsi.logging.ProgressLogger; +import it.unimi.dsi.webgraph.BVGraph; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.PrintStream; +import java.io.Reader; +import java.util.Scanner; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.io.output.NullOutputStream; +import org.apache.commons.lang.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.martiansoftware.jsap.JSAP; +import com.martiansoftware.jsap.JSAPResult; +import com.martiansoftware.jsap.Parameter; +import com.martiansoftware.jsap.SimpleJSAP; +import 
com.martiansoftware.jsap.Switch; +import com.martiansoftware.jsap.UnflaggedOption; + +public class WikipediaCategoryProducer { + + public static Logger LOGGER = LoggerFactory.getLogger(WikipediaCategoryProducer.class); + + private static final int CATEGORY_NAME_INDEX = "Category:".length(); + private static final int UNSEEN_CATEGORY = -1; + private static final int CATEGORY_FIELD = 3; + private static final String SEPARATOR_REGEX = "OXOXO"; + private static final int ESITMATED_NUM_OF_PAGES = 6721260; // this is needed only to log progress + + + + + private final NamespacedWikipediaDocumentSequence wikipediaDocumentSequence; + private final Int2ObjectMap pageId2Name; + private final Object2IntMap catName2Id; + private final Int2ObjectMap page2cat; + private final Int2ObjectOpenHashMap cat2cat; + private PrintStream plainUrisFile = new PrintStream(NullOutputStream.NULL_OUTPUT_STREAM); + + private int nextPageId; + private int nextCategoryId; + + WikipediaCategoryProducer(NamespacedWikipediaDocumentSequence wds) { + this.wikipediaDocumentSequence = wds; + catName2Id = new Object2IntOpenHashMap(); + catName2Id.defaultReturnValue(UNSEEN_CATEGORY); + pageId2Name = new Int2ObjectOpenHashMap(); + page2cat = new Int2ObjectOpenHashMap(); + cat2cat = new Int2ObjectOpenHashMap(); + nextPageId = 0; + nextCategoryId = 0; + } + + + private static enum Namespace { + ARTICLE, OTHER, CATEGORY; + } + + private Namespace getNamespace(String title) { + int pos = title.indexOf(':'); + if (pos < 0) return Namespace.ARTICLE; + String namespace = title.substring(0, pos); + return (namespace.toLowerCase().equals("category")) ? + Namespace.CATEGORY + : ( + (wikipediaDocumentSequence.isATrueNamespace(namespace)) ? 
+ Namespace.OTHER + : Namespace.ARTICLE + ); + } + + private int getCategoryId(String category) { + int categoryId = catName2Id.getInt(category); + if (categoryId == UNSEEN_CATEGORY) { + categoryId = this.nextCategoryId++; + catName2Id.put(category, categoryId); + } + return categoryId; + } + + private IntSet parseCategories(Document wikiPage) throws IOException { + String categoryString = IOUtils.toString((Reader) wikiPage.content(CATEGORY_FIELD)); + IntSet categoryIds = new IntOpenHashSet(); + int pipeIndex; + + for (String category : categoryString.split(SEPARATOR_REGEX)) { + if ((pipeIndex = category.indexOf('|')) > -1) + category = category.substring(0, pipeIndex); + + category = StringUtils.strip(category); + if (category.length() > 0) + categoryIds.add(getCategoryId(category)); + } + + return categoryIds; + } + + @SuppressWarnings("resource") // the warning on wikiPage is false. + public void extractAllData() throws IOException { + DocumentIterator wikiPagesIterator = wikipediaDocumentSequence.iterator(); + Document wikiPage; + String title; + + ProgressLogger pl = new ProgressLogger(LOGGER, "pages"); + pl.expectedUpdates = ESITMATED_NUM_OF_PAGES; + pl.info = new Object() { + public String toString() {return catName2Id.size() + " categories found.";} + }; + pl.start("Starting to iterate all the pages in the XML file..."); + + // iterating pages + while ((wikiPage = wikiPagesIterator.nextDocument()) != null) { + switch (getNamespace(title = wikiPage.title().toString())) { + case CATEGORY: + cat2cat.put( + getCategoryId(title.substring(CATEGORY_NAME_INDEX)), + parseCategories(wikiPage) + ); + break; + case ARTICLE: + int pageId = nextPageId++; + pageId2Name.put(pageId, title); + + page2cat.put(pageId, + parseCategories(wikiPage) + ); + plainUrisFile.println(wikiPage.uri()); + + break; + default: + break; + } + + wikiPage.close(); + pl.update(); + + } + pl.done(); + + wikipediaDocumentSequence.close(); + wikiPagesIterator.close(); + } + + private void 
setPlainUrisFile(String path) throws FileNotFoundException { + this.plainUrisFile = new PrintStream(new File(path)); + } + + private void saveAllTo(String basename) throws Exception { + try { + BinIO.storeObject(pageId2Name, basename + "pageId2Name.ser"); + BinIO.storeObject(catName2Id, basename + "catName2Id.ser"); + BinIO.storeObject(page2cat, basename + "page2cat.ser"); + + if (cat2cat.isEmpty()) LOGGER.error("THE PARSING DID NOT FIND ANY CATEGORY PSEUDOTREE"); + else BVGraph.store(new IntMapGraph(cat2cat), "categoryPseudotree"); + + BinIO.storeObject(MapUtils.invert(catName2Id), basename + "catId2Name.ser"); + } catch (IOException e) { + LOGGER.error("Cannot save something :( :(", e); + } + this.plainUrisFile.close(); + } + + static String askForString(String s) { + System.out.println(s); + Scanner scanner = new Scanner(System.in); + String nextLine = scanner.nextLine(); + scanner.close(); + return nextLine; + } + + public static void main(String[] rawArguments) throws Exception { + + SimpleJSAP jsap = new SimpleJSAP( + WikipediaCategoryProducer.class.getName(), + "Read a wikipedia dump and produces these files as " + + "serialized Java objects: \n" + + " * pageId2Name.ser, an Int2ObjectMap from page ids to " + + "wikipedia page names \n" + + " * catId2Name.ser, an Object2IntMap from category ids to " + + "category names \n" + + " * page2cat.ser, an Int2ObjectMap from page ids to an IntSet" + + "of category ids \n" + + " * categoryPseudotree.graph, the Wikipedia Category Hierarchy " + + "in BVGraph format", + new Parameter[] { + new UnflaggedOption( "input", JSAP.STRING_PARSER, JSAP.REQUIRED, + "The pages-articles.xml input file, from Wikipedia." ), + new UnflaggedOption( "basename", JSAP.STRING_PARSER, JSAP.REQUIRED, + "The basename of the output files (p.e. 
a Directory with / in the end)" ), + new Switch("bzip", 'z', "bzip", "Interpret the input file as bzipped"), + new Switch("verbose", 'v', "verbose", "Print every category found to StdErr") + }); + + // Initializing input read + JSAPResult args = jsap.parse( rawArguments ); + if ( jsap.messagePrinted() ) System.exit( 1 ); + + NamespacedWikipediaDocumentSequence wikipediaDocumentSequence = new NamespacedWikipediaDocumentSequence( + args.getString("input"), + args.getBoolean("bzip"), + "http://en.wikipedia.org/wiki/", true, + true // keep all namespaces + ); + WikipediaCategoryProducer reader = + new WikipediaCategoryProducer(wikipediaDocumentSequence); + reader.setPlainUrisFile(args.getString("basename") + "pages.uris"); + reader.extractAllData(); + reader.saveAllTo(args.getString("basename")); + wikipediaDocumentSequence.close(); + + } + + +} diff --git a/java/src/it/unimi/di/wikipedia/utils/IntMapGraph.java b/java/src/it/unimi/di/wikipedia/utils/IntMapGraph.java new file mode 100644 index 0000000..7956105 --- /dev/null +++ b/java/src/it/unimi/di/wikipedia/utils/IntMapGraph.java @@ -0,0 +1,138 @@ +package it.unimi.di.wikipedia.utils; + +import it.unimi.dsi.fastutil.ints.Int2ObjectMap; +import it.unimi.dsi.fastutil.ints.Int2ObjectMap.Entry; +import it.unimi.dsi.fastutil.ints.IntArrays; +import it.unimi.dsi.fastutil.ints.IntSet; +import it.unimi.dsi.fastutil.ints.IntSets; +import it.unimi.dsi.fastutil.io.BinIO; +import it.unimi.dsi.logging.ProgressLogger; +import it.unimi.dsi.webgraph.BVGraph; +import it.unimi.dsi.webgraph.ImmutableGraph; +import it.unimi.dsi.webgraph.LazyIntIterator; +import it.unimi.dsi.webgraph.LazyIntIterators; +import it.unimi.dsi.webgraph.ScatteredArcsASCIIGraph; + +import java.io.IOException; +import java.util.concurrent.TimeUnit; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.martiansoftware.jsap.FlaggedOption; +import com.martiansoftware.jsap.JSAP; +import com.martiansoftware.jsap.JSAPException; +import 
com.martiansoftware.jsap.JSAPResult; +import com.martiansoftware.jsap.Parameter; +import com.martiansoftware.jsap.SimpleJSAP; +import com.martiansoftware.jsap.UnflaggedOption; + +public class IntMapGraph extends ImmutableGraph { + public static Logger LOGGER = LoggerFactory.getLogger(IntMapGraph.class); + + public final Int2ObjectMap map; + private final int numNodes, numArcs; + + public IntMapGraph(Int2ObjectMap map) { + this.map = map; + if (map.defaultReturnValue() == null || !map.defaultReturnValue().equals(IntSets.EMPTY_SET)) { + LOGGER.warn("It is necessary to set default return value of the map as the empty set."); + map.defaultReturnValue(IntSets.EMPTY_SET); + } + + int maxNodeIndex = 0, numArcs = 0; + for (Entry x : map.int2ObjectEntrySet()) { + if (x.getIntKey() > maxNodeIndex) + maxNodeIndex = x.getIntKey(); + for (int succ : x.getValue()) { + if (succ > maxNodeIndex) + maxNodeIndex = succ; + numArcs++; + } + } + + this.numArcs = numArcs; + this.numNodes = maxNodeIndex+1; + } + + @Override + public int numNodes() { + return numNodes; + } + + @Override + public boolean randomAccess() { + return true; + } + + @Override + public int outdegree(int x) { + return map.get(x).size(); + } + + @Override + public long numArcs() { + return numArcs; + } + + @Override + public int[] successorArray( final int x ) { + int[] succ = map.get(x).toIntArray(); + IntArrays.quickSort(succ); + return succ; + } + + @Override + public LazyIntIterator successors( final int x ) { + return LazyIntIterators.wrap( successorArray(x) ); + } + + + @Override + public ImmutableGraph copy() { + throw new UnsupportedOperationException(); + } + + @SuppressWarnings("unchecked") + public static void main( String args[] ) throws IllegalArgumentException, SecurityException, IOException, JSAPException, ClassNotFoundException { + String basename; + SimpleJSAP jsap = new SimpleJSAP( ScatteredArcsASCIIGraph.class.getName(), "Converts a int2intset fastutil map into a BVGraph.", + new Parameter[] { + 
new UnflaggedOption( "map", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The serialized Int2ObjectMap" ), + new FlaggedOption( "logInterval", JSAP.LONG_PARSER, Long.toString( ProgressLogger.DEFAULT_LOG_INTERVAL ), JSAP.NOT_REQUIRED, 'l', "log-interval", "The minimum time interval between activity logs in milliseconds." ), + new FlaggedOption( "comp", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, 'c', "comp", "A compression flag (may be specified several times)." ).setAllowMultipleDeclarations( true ), + new FlaggedOption( "windowSize", JSAP.INTEGER_PARSER, String.valueOf( BVGraph.DEFAULT_WINDOW_SIZE ), JSAP.NOT_REQUIRED, 'w', "window-size", "Reference window size (0 to disable)." ), + new FlaggedOption( "maxRefCount", JSAP.INTEGER_PARSER, String.valueOf( BVGraph.DEFAULT_MAX_REF_COUNT ), JSAP.NOT_REQUIRED, 'm', "max-ref-count", "Maximum number of backward references (-1 for ∞)." ), + new FlaggedOption( "minIntervalLength", JSAP.INTEGER_PARSER, String.valueOf( BVGraph.DEFAULT_MIN_INTERVAL_LENGTH ), JSAP.NOT_REQUIRED, 'i', "min-interval-length", "Minimum length of an interval (0 to disable)." ), + new FlaggedOption( "zetaK", JSAP.INTEGER_PARSER, String.valueOf( BVGraph.DEFAULT_ZETA_K ), JSAP.NOT_REQUIRED, 'k', "zeta-k", "The k parameter for zeta-k codes." ), + new UnflaggedOption( "basename", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The basename of the output graph" ), + } + ); + + JSAPResult jsapResult = jsap.parse( args ); + if ( jsap.messagePrinted() ) System.exit( 1 ); + + basename = jsapResult.getString( "basename" ); + + int flags = 0; + for( String compressionFlag: jsapResult.getStringArray( "comp" ) ) { + try { + flags |= BVGraph.class.getField( compressionFlag ).getInt( BVGraph.class ); + } + catch ( Exception notFound ) { + throw new JSAPException( "Compression method " + compressionFlag + " unknown." 
); + } + } + + final int windowSize = jsapResult.getInt( "windowSize" ); + final int zetaK = jsapResult.getInt( "zetaK" ); + int maxRefCount = jsapResult.getInt( "maxRefCount" ); + if ( maxRefCount == -1 ) maxRefCount = Integer.MAX_VALUE; + final int minIntervalLength = jsapResult.getInt( "minIntervalLength" ); + + final ProgressLogger pl = new ProgressLogger( LOGGER, jsapResult.getLong( "logInterval" ), TimeUnit.MILLISECONDS ); + ImmutableGraph graph = new IntMapGraph((Int2ObjectMap) BinIO.loadObject(jsapResult.getString("map"))); + BVGraph.store( graph, basename, windowSize, maxRefCount, minIntervalLength, zetaK, flags, pl ); + } + +} diff --git a/java/src/it/unimi/di/wikipedia/utils/MapUtils.java b/java/src/it/unimi/di/wikipedia/utils/MapUtils.java new file mode 100644 index 0000000..5ebbf29 --- /dev/null +++ b/java/src/it/unimi/di/wikipedia/utils/MapUtils.java @@ -0,0 +1,133 @@ +package it.unimi.di.wikipedia.utils; + +import it.unimi.dsi.fastutil.ints.AbstractInt2ObjectFunction; +import it.unimi.dsi.fastutil.ints.Int2DoubleMap; +import it.unimi.dsi.fastutil.ints.Int2ObjectFunction; +import it.unimi.dsi.fastutil.ints.IntComparator; +import it.unimi.dsi.fastutil.objects.AbstractObject2IntFunction; +import it.unimi.dsi.fastutil.objects.Object2IntFunction; +import it.unimi.dsi.logging.ProgressLogger; + +import java.lang.reflect.InvocationTargetException; +import java.util.Map; + +import org.apache.commons.lang.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class MapUtils { + public static Logger LOGGER = LoggerFactory.getLogger(MapUtils.class); + + final public static Int2ObjectFunction NUMBER_PRINTER = new AbstractInt2ObjectFunction(){ + private static final long serialVersionUID = 1L; + + @Override + public String get(int key) { + return Integer.toString(key); + } + + @Override + public boolean containsKey(int key) { + return true; + } + + @Override + public int size() { + return Integer.MAX_VALUE; + } + + }; + + final public 
static Object2IntFunction NUMBER_READER = new AbstractObject2IntFunction(){ + private static final long serialVersionUID = 1L; + + @Override + public int getInt(Object key) { + if (! (key instanceof String)) + return defRetValue; + try { + return Integer.parseInt((String) key); + } catch (NumberFormatException e) { + return defRetValue; + } + } + + @Override + public boolean containsKey(Object key) { + if (! (key instanceof String)) + return false; + else { + try { + Integer.parseInt((String) key); + return true; + } catch (NumberFormatException e) { + return false; + } + } + } + + @Override + public int size() { + return Integer.MAX_VALUE; + } + + + }; + + public static Class invertMapType(Class cls) throws ClassNotFoundException { + String[] mapType = StringUtils.splitByCharacterTypeCamelCase(cls.getSimpleName()); + String type1 = mapType[0]; + if (!mapType[1].equals("2")) + throw new IllegalArgumentException(cls + " is not a fastutil map."); + String type2 = mapType[2]; + mapType[0] = type2; + mapType[2] = type1; + String newType = StringUtils.join(mapType); + newType = "it.unimi.dsi.fastutil." + type2.toLowerCase() + "s." 
+ newType; + return Class.forName(newType); + } + + @SuppressWarnings({ "rawtypes", "unchecked" }) + public static Map invert(Map inputMap) throws InstantiationException, + IllegalAccessException, InvocationTargetException, + NoSuchMethodException, ClassNotFoundException { + LOGGER.info("Inverting map..."); + Map outputMap = (Map) invertMapType(inputMap.getClass()).getConstructor(new Class[] {}).newInstance(new Object[] {}); + + ProgressLogger pl = new ProgressLogger(LOGGER, "entries"); + pl.expectedUpdates = inputMap.size(); + pl.start(); + + for (Object entryObj : inputMap.entrySet()) { + Map.Entry entry = (Map.Entry) entryObj; + Object oldValue = outputMap.put(entry.getValue(), entry.getKey()); + if (oldValue != null) + throw new IllegalArgumentException( + "The value " + entry.getValue() + " is associated to both '" + + oldValue + "' and '" + entry.getKey() + "'. The map is not" + + "bijective" + ); + pl.lightUpdate(); + } + pl.done(); + return outputMap; + } + + public static IntComparator comparatorPuttingLargestMappedValueFirst(final Int2DoubleMap map) { + return new IntComparator() { + public int compare(Integer o1, Integer o2) { return compare(o1.intValue(), o2.intValue()); } + public int compare(int k1, int k2) { + return Double.compare(map.get(k2), map.get(k1)); + } + }; + } + + public static IntComparator comparatorPuttingSmallestMappedValueFirst(final Int2DoubleMap map) { + return new IntComparator() { + public int compare(Integer o1, Integer o2) { return compare(o1.intValue(), o2.intValue()); } + public int compare(int k1, int k2) { + return Double.compare(map.get(k1), map.get(k2)); + } + }; + } +} diff --git a/page2cat-HEAD.tsv b/page2cat-HEAD.tsv new file mode 100644 index 0000000..07df109 --- /dev/null +++ b/page2cat-HEAD.tsv @@ -0,0 +1,50 @@ +Anarchism Political ideologies Social theories Political culture Anti-capitalism Far-left politics Anarchism +Değnek Villages by country Regions of Turkey Populated places in Turkey by province +Queensland 
Conservatorium Griffith University Universities by country Entertainment in Australia Australian capital cities +Octagon Chapel, Liverpool Churches Buildings and structures in England by city +Radha Mohan People by status People by ethnicity People by ethnicity and occupation Indian people Film directors +Thomas Craig (jurist) Scottish society Alumni by university or college in Europe Poetry by nation or language Scottish people by occupation +Microsorum vieillardii Plants +Princess Augusta Sophia of the United Kingdom British monarchy European royal families +Derek Thompson (baseball) People by status Minor league baseball players by team Baseball players by team Baseball in the United States by state Major League Baseball +Battle of the Coral Sea (film) History of the United States Entertainment in the United States Military personnel Works by type and year Companies based in Los Angeles County, California +Carvone Qualia Organic compounds Hydrocarbons +Red-tailed monkey Catarrhini Animals Vertebrates by country +2003 MAC Men's Basketball Tournament Sports in the United States by city College men's basketball seasons in the United States +Louis Franchet d'Espèrey French people People by country and city Military personnel by war People by region in France Military history of France +Benjamin Tuke Irish sportspeople Rugby union teams +Cornelis de Bie Contents European writers People by country and city Dead people Alumni by university or college in Europe Poets by nationality Belgian people by occupation +Washington State Legislature State governments of the United States State government in the United States Bicameral legislatures +Đàn đá Natural materials Asian music Hornbostel-Sachs +Issa (Senegalese singer) Life People by status Alumni by university or college in the United States by state People of African descent Capitals in Africa +William Johnston Tupper Schools Alumni by university or college in the United States Canadian people by occupation Heads of 
state of former countries Canadian people by ethnic or national origin Nova Scotia Provincial and territorial capitals of Canada +Saint Vincent Academy Roman Catholic Church in the United States Counties in the New York metropolitan area High schools and secondary schools +Bonkers (song) Songs by artist Industry in the United Kingdom Works by decade +Principal axis +Thomas Jones (historian) History of the United States by state People from New York People by city or town in England Local government in New York English colonization of the Americas Wars of independence +Show Us Your Tiddas! Plays by nationality +LOA +The Last Days of the Late, Great State of California California California culture Novels by genre +Aladdin (1992 Golden Films film) Folklore 20th century in the United States Arts in the United States Entertainment in the United States Films by genre +Alejandro Ibarra People by status States of Mexico +Banbury Local government in England by county Towns in England by county +Swimming at the 1992 Summer Olympics – Men's 100 metre breaststroke Summer Olympics events by year +Underdown +Dreaming Lips +Escrow Legal documents Jargon Real estate Finance Personal finance +Aída Álvarez People by status History of the United States government Agencies of the United States government American people by occupation People by city in the United States Alumni by university or college in the United States by state American politicians American politicians by state Businesspeople by nationality Journalists by nationality +Mulberry High School +Anaxagoras (disambiguation) +Cretinism Injustice Endocrine, nutritional and metabolic diseases Disability +Nenad Petrović (chess composer) Individual sports Sportspeople by sport and nationality +Construction Clients' Group Building +Gmina Osieczna, Pomeranian Voivodeship Country subdivisions of Europe Land counties of Poland +Mikołaj Gomółka People by city in Poland Classical musicians Polish people by occupation +Academy at 
Central Cities in the United States by state +The Fourth Legacy Heavy metal albums by genre +Wonderful Shadow Songs by artist Products by company Works by decade +Kayentavenator Megafauna Fossils +Jiang Qin Life Cities in China +Ryukyu flying fox Contents Vertebrates by country Species described in the 19th century Eating behaviors +Casili, Sorsogon +Division of Coolgardie States and territories of Australia Constituencies diff --git a/page2cat.tsv.gz.REMOVED.git-id b/page2cat.tsv.gz.REMOVED.git-id new file mode 100644 index 0000000..3a21b05 --- /dev/null +++ b/page2cat.tsv.gz.REMOVED.git-id @@ -0,0 +1 @@ +e06bd5f41b12cc0b200ad99ac20c29aecff066ba \ No newline at end of file diff --git a/ranked-categories-HEAD.tsv b/ranked-categories-HEAD.tsv new file mode 100644 index 0000000..306d5ea --- /dev/null +++ b/ranked-categories-HEAD.tsv @@ -0,0 +1,20 @@ +Culture 84033.0551327917 +Society 83964.64836795685 +Humanities 81135.57333792261 +Social sciences 79597.60307068429 +Contents 79240.52000298478 +Humans 78310.63217559278 +Nationality 77141.57482468022 +Anthropology 76866.94059820025 +Fields of history 76409.94215997284 +Structure 76149.19697245964 +Academic disciplines 76087.69245474423 +Society by nationality 75270.89088860311 +Cultural spheres of influence 74697.36255461193 +Political philosophy 74500.76841205012 +Subfields by academic discipline 74460.0224443292 +Cultures 74402.63092080808 +Social groups 74076.41203034452 +Places 73634.00505099582 +Sociology of culture 73520.831210539 +Sociology 73343.61321021541 diff --git a/ranked-categories.tsv.gz.REMOVED.git-id b/ranked-categories.tsv.gz.REMOVED.git-id new file mode 100644 index 0000000..094c143 --- /dev/null +++ b/ranked-categories.tsv.gz.REMOVED.git-id @@ -0,0 +1 @@ +54d6da6a5c2f1dee2d12aa4103fc47b87251dd30 \ No newline at end of file diff --git a/run.sh b/run.sh new file mode 100644 index 0000000..e26e0e5 --- /dev/null +++ b/run.sh @@ -0,0 +1,11 @@ +WIKIDUMP_XML=enwiki-20160407-pages-articles.xml.bz2 
N_TOP_CATEGORIES=10000

# Abort immediately if any pipeline step below fails, so we never run the
# category selector on partial output from the parser.
set -e

# Sanity check: make sure the compiled LlamaFur classes are on the classpath
# before doing any expensive work. (`java --help` on a missing class prints
# "Could not find or load main class" on stderr.) The condition of an `if`
# is exempt from `set -e`, so a non-zero exit here does not abort the script.
if java it.unimi.di.wikipedia.categories.CategorySelectionToolchain --help 2>&1 | grep -q "Could not find or load main class"; then
  echo 'Java LlamaFur commands not found. You should compile them and include them in the classpath. Look at the "Compile LlamaFur code" part of readme.md!' >&2
  exit 1
fi

# Step 1: parse the Wikipedia XML dump; writes page2cat.ser, pageId2Name.ser,
# catId2Name.ser and the categoryPseudotree BVGraph into the current directory.
# Quote the expansion: the dump path may contain spaces.
java it.unimi.di.wikipedia.parsing.WikipediaCategoryProducer "$WIKIDUMP_XML" ./ --bzip

# Step 2: rank the categories and produce the final cleansed TSV outputs,
# excluding maintenance pseudo-categories via the -e filters.
java it.unimi.di.wikipedia.categories.CategorySelector categoryPseudotree \
    page2cat.ser pageId2Name.ser catId2Name.ser \
    -e "wiki" -e "categories" -e "main topic classifications" -e "template" -e "navigational box" \
    "$N_TOP_CATEGORIES" ranked-categories.tsv page2cat.tsv