diff --git a/README.md b/README.md index e79b34a..e5da15d 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,68 @@ -# wikipedia-categories -Cleansing Wikipedia Categories using Centrality +# Cleansing Wikipedia Categories using Centrality +## by Paolo Boldi and Corrado Monti + +We propose a novel general technique aimed at pruning and cleansing the Wikipedia category hierarchy, with a tunable level of aggregation. Our approach is endogenous, since it does not use any information coming from Wikipedia articles, but it is based solely on the user-generated (noisy) Wikipedia category folksonomy itself. We show how the proposed techniques can help reduce the level of noise in the hierarchy and discuss how alternative centrality measures can impact the result differently. + +For more information see [the paper, presented at WWW2016 (companion), Wiki Workshop 2016, at Montreal](http://dl.acm.org/ft_gateway.cfm?id=2891111&ftid=1707848). + +# Provided dataset + +* `page2cat.tsv.gz` is a gzipped TSV file with the mapping from Wikipedia pages to cleansed categories, from the most important to the least important. +* `ranked-categories.tsv.gz` is a gzipped TSV file with every Wikipedia category and our importance score. + +We also provide the heads of these files to show what they look like after unzipping. + +If you use the dataset or the code, please cite: +Boldi, Paolo, and Corrado Monti. "Cleansing wikipedia categories using centrality." Proceedings of the 25th International Conference Companion on World Wide Web. International World Wide Web Conferences Steering Committee, 2016. 
+ +Bibtex: + + @inproceedings{boldi2016cleansing, + title={Cleansing wikipedia categories using centrality}, + author={Boldi, Paolo and Monti, Corrado}, + booktitle={Proceedings of the 25th International Conference Companion on World Wide Web}, + pages={969--974}, + year={2016}, + organization={International World Wide Web Conferences Steering Committee} + } + + +PLEASE NOTE: *Experiments described in the paper were run on a 2014 snapshot called +`enwiki-20140203-pages-articles.xml.bz2`, while – to provide an updated version – +this dataset refers to `enwiki-20160407-pages-articles.xml.bz2`.* + +# How to run code + +Set up the environment +---------------------- + +In order to compile the code, you'll need Java 8, Ant and Ivy. To install +them (e.g. inside a clean [Vagrant](http://vagrantup.com/) box with +`ubuntu/trusty64`), you should use these lines: + + sudo apt-get --yes update + sudo apt-get install -y software-properties-common python-software-properties + echo oracle-java8-installer shared/accepted-oracle-license-v1-1 select true | sudo /usr/bin/debconf-set-selections + sudo add-apt-repository ppa:webupd8team/java -y + sudo apt-get update + sudo apt-get --yes install oracle-java8-installer + sudo apt-get --yes install oracle-java8-set-default + sudo apt-get --yes install ant ivy + sudo ln -s -T /usr/share/java/ivy.jar /usr/share/ant/lib/ivy.jar + + +Compile the code +---------------------- + +If the environment is set up properly, you should install git and download this repo with + + sudo apt-get install git + git clone https://github.com/corradomonti/wikipedia-categories.git + +and then go to the directory `java`. There, run: + +* `ant ivy-setupjars` to download dependencies +* `ant` to compile +* `. setcp.sh` to include the produced jar inside the Java classpath. + +Now you are ready to run `run.sh`. 
diff --git a/java/build.properties b/java/build.properties new file mode 100644 index 0000000..2f57d9a --- /dev/null +++ b/java/build.properties @@ -0,0 +1,25 @@ +version=1.0 + +build.sysclasspath=ignore + +jar.base=/usr/share/java +javadoc.base=/usr/share/javadoc + +dist=dist +src=src +test=test +slow=slow +reports=reports +coverage=coverage +checkstyle=checkstyle +docs=docs +build=build +instrumented=instr + +j2se.apiurl=http://download.oracle.com/javase/6/docs/api/ +fastutil.apiurl=http://fastutil.dsi.unimi.it/docs/ +jsap.apiurl=http://www.martiansoftware.com/jsap/doc/javadoc/ +junit.apiurl=http://junit.sourceforge.net/javadoc_40/ +log4j.apiurl=http://logging.apache.org/log4j/1.2/apidocs/ +slf4j.apiurl=http://www.slf4j.org/apidocs/ +webgraph.apiurl=http://webgraph.dsi.unimi.it/docs/ \ No newline at end of file diff --git a/java/build.xml b/java/build.xml new file mode 100644 index 0000000..23dd2ce --- /dev/null +++ b/java/build.xml @@ -0,0 +1,326 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/java/ivy.xml b/java/ivy.xml new file mode 100644 index 0000000..2db5e40 --- /dev/null +++ b/java/ivy.xml @@ -0,0 +1,40 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/java/setcp.sh b/java/setcp.sh new file mode 100755 index 0000000..bc23aa3 --- /dev/null +++ b/java/setcp.sh @@ -0,0 +1,14 @@ +JAR=wikicategories + +sourcedir=$(cd $(dirname 
${BASH_ARGV[0]}) && pwd) +count=$(\ls -1 $sourcedir/$JAR-*.jar 2>/dev/null | wc -l) + +if (( count == 0 )); then + echo "WARNING: no $JAR jar file." +elif (( count > 1 )); then + echo "WARNING: several $JAR jar files ($(\ls -m $JAR-*.jar))" +else + export CLASSPATH=$(ls -1 $sourcedir/$JAR-*.jar | tail -n 1):$CLASSPATH +fi + +export CLASSPATH=$CLASSPATH:$(\ls -1 $sourcedir/jars/runtime/*.jar | paste -d: -s -) diff --git a/java/src/it/unimi/di/wikipedia/categories/CategorySelector.java b/java/src/it/unimi/di/wikipedia/categories/CategorySelector.java new file mode 100644 index 0000000..9e6b4f9 --- /dev/null +++ b/java/src/it/unimi/di/wikipedia/categories/CategorySelector.java @@ -0,0 +1,224 @@ +package it.unimi.di.wikipedia.categories; + +import it.unimi.di.wikipedia.utils.MapUtils; +import it.unimi.dsi.Util; +import it.unimi.dsi.fastutil.ints.Int2DoubleMap; +import it.unimi.dsi.fastutil.ints.Int2DoubleOpenHashMap; +import it.unimi.dsi.fastutil.ints.Int2ObjectMap; +import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap; +import it.unimi.dsi.fastutil.ints.IntArrays; +import it.unimi.dsi.fastutil.ints.IntOpenHashSet; +import it.unimi.dsi.fastutil.ints.IntSet; +import it.unimi.dsi.fastutil.io.BinIO; +import it.unimi.dsi.logging.ProgressLogger; +import it.unimi.dsi.webgraph.ImmutableGraph; +import it.unimi.dsi.webgraph.Transform; +import it.unimi.dsi.webgraph.algo.GeometricCentralities; + +import java.io.PrintWriter; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import cern.colt.Arrays; + +import com.martiansoftware.jsap.FlaggedOption; +import com.martiansoftware.jsap.JSAP; +import com.martiansoftware.jsap.JSAPResult; +import com.martiansoftware.jsap.Parameter; +import com.martiansoftware.jsap.SimpleJSAP; +import com.martiansoftware.jsap.UnflaggedOption; + +public class CategorySelector { + final static Logger LOGGER = LoggerFactory.getLogger(CategorySelector.class); + + // Input data + private final ImmutableGraph wcg, transposedWcg; + private final 
Int2ObjectMap catId2name; + public final int numOriginalCat, numFinalCat; + public final String[] excludedStrings; + + // Output data + public int[] orderedCatIds; + private Int2DoubleMap catId2rank; + private IntSet milestones, excludedCatIds; + + public CategorySelector(ImmutableGraph wcg, Int2ObjectMap catId2name, int numFinalCat, String[] excludedStrings) { + this.wcg = wcg; + this.transposedWcg = Transform.transpose(wcg); + this.catId2name = catId2name; + this.numOriginalCat = wcg.numNodes(); + this.numFinalCat = numFinalCat; + this.excludedStrings = excludedStrings; + + LOGGER.debug("Examples from the provided Wikipedia Category Graph: "); + for (int i = 0; i < 10; i++) { + int cat = (int) (Math.random() * numOriginalCat); + LOGGER.debug( "\"" + catId2name.get(cat) + "\" is listed as a subcategory of \"" + + catId2name.get(this.wcg.successors(cat).nextInt()) + "\""); + } + } + + private static IntSet findCategoriesContainingStrings(final Int2ObjectMap catId2name, final String[] lowercasedString) { + IntSet results = new IntOpenHashSet(); + String name; + for (Int2ObjectMap.Entry c2n : catId2name.int2ObjectEntrySet()) { + name = c2n.getValue().toLowerCase(); + for (String string : lowercasedString) + if (name.indexOf(string) != -1) { + results.add(c2n.getIntKey()); + break; + } + } + return results; + } + + public void compute() { + LOGGER.info("Ranking nodes..."); + final GeometricCentralities ranker = new GeometricCentralities(transposedWcg, new ProgressLogger(LOGGER)); + try { + ranker.compute(); + } catch (InterruptedException e) { throw new RuntimeException(e); } + catId2rank = new Int2DoubleOpenHashMap(Util.identity(numOriginalCat), ranker.harmonic); + LOGGER.info("Nodes ranked."); + + LOGGER.info("Excluding categories containing " + Arrays.toString(excludedStrings) + "..."); + excludedCatIds = findCategoriesContainingStrings(catId2name, excludedStrings); + for (int catIdToExclude : excludedCatIds) + catId2rank.put(catIdToExclude, 
Double.NEGATIVE_INFINITY); + LOGGER.info(excludedCatIds.size() + " categories excluded, e.g. \"" + catId2name.get(excludedCatIds.toIntArray()[0]) + "\"."); + + LOGGER.info("Ordering categories by centrality and selecting milestones..."); + orderedCatIds = Util.identity(numOriginalCat); + IntArrays.quickSort(orderedCatIds, MapUtils.comparatorPuttingLargestMappedValueFirst(catId2rank)); + milestones = new IntOpenHashSet(IntArrays.trim(orderedCatIds, numFinalCat)); + LOGGER.info(milestones.size() + " milestones selected. 1st category: " + catId2name.get(orderedCatIds[0])); + } + + public Int2ObjectMap recategorize(final Int2ObjectMap page2cat) { + LOGGER.info("Computing closest milestones..."); + final int[] closestMilestones = new HittingDistanceMinimizer(transposedWcg, milestones).compute(); + LOGGER.info("Closest milestones computed, printing a sample:"); + for (int i = 0; i < 10; i++) { + int cat = (int) (Math.random() * numOriginalCat); + System.out.println( "\"" + catId2name.get(cat) + "\" has been ramapped to \"" + + catId2name.get(closestMilestones[cat]) + "\""); + } + + ProgressLogger pl = new ProgressLogger(LOGGER, "pages"); + pl.expectedUpdates = page2cat.size(); + pl.start("Moving old categories to closest milestones..."); + Int2ObjectMap page2newCat = new Int2ObjectOpenHashMap(page2cat.size()); + for (Int2ObjectMap.Entry p2c : page2cat.int2ObjectEntrySet()) { + IntSet newCategories = new IntOpenHashSet(); + int milestone; + for (int cat : p2c.getValue()) { + if (cat < 0 || cat >= numOriginalCat) + LOGGER.error("Category #" + cat + " is not listed in the Wikipedia Category Graph" + + " (it has only " + numOriginalCat + " nodes)."); + else { + milestone = closestMilestones[cat]; + if (milestone != -1) { + if (!milestones.contains(milestone)) + throw new IllegalStateException(milestone + " is not a milestone."); + newCategories.add(milestone); + } + } + } + page2newCat.put(p2c.getIntKey(), newCategories); + pl.lightUpdate(); + } + pl.done(); + + return 
page2newCat; + } + + private String[] toSortedNames(IntSet categories) { + String[] names = new String[categories.size()]; + int[] sortedCat = categories.toIntArray(); + IntArrays.quickSort(sortedCat, MapUtils.comparatorPuttingLargestMappedValueFirst(catId2rank)); + for (int i = 0; i < sortedCat.length; i++) names[i] = catId2name.get(sortedCat[i]); + return names; + } + + @SuppressWarnings({ "unchecked" }) + public static void main( String rawArguments[] ) throws Exception { + SimpleJSAP jsap = new SimpleJSAP( CategorySelector.class.getName(), + "Cleanse the wikipedia categorization system.", + new Parameter[] { + new UnflaggedOption( "WCG", + JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, + "The BVGraph basename of the wikipedia category graph." ), + new UnflaggedOption( "page2cat", + JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, + "The serialized int 2 intset that represents set of categories for each page." ), + new UnflaggedOption( "pageNames", + JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, + "The serialized Int2ObjectMap file with association of categories to their names." ), + new UnflaggedOption( "catNames", + JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, + "The serialized Int2ObjectMap file with association of categories to their names." ), + new FlaggedOption( "exclude", + JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, + 'e', "exclude", + "Exclude all those categories whose LOWERCASED name contains one of the provided strings." ) + .setAllowMultipleDeclarations(true), + new UnflaggedOption( "C", + JSAP.INTEGER_PARSER, "10000", JSAP.REQUIRED, JSAP.NOT_GREEDY, + "Number of categories to retain." ), + new UnflaggedOption( "output-rankedcat", + JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, + "Where the output (ordered) category 2 score TSV file will be saved." 
+ ), + new UnflaggedOption( "output-page2cat", + JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, + "Where the output page2cat TSV file will be saved." + ), + } + ); + + final JSAPResult args = jsap.parse( rawArguments ); + if ( jsap.messagePrinted() ) System.exit( 1 ); + + LOGGER.info("Reading input files..."); + Int2ObjectMap catNames = (Int2ObjectMap) BinIO.loadObject(args.getString("catNames")); + Int2ObjectMap pageNames = (Int2ObjectMap) BinIO.loadObject(args.getString("pageNames")); + Int2ObjectMap page2cat = (Int2ObjectMap) BinIO.loadObject(args.getString("page2cat")); + ImmutableGraph wcg = ImmutableGraph.load(args.getString("WCG")); + final int numFinalCat = args.getInt("C"); + + CategorySelector categorySelector = new CategorySelector(wcg, catNames, numFinalCat, args.getStringArray("exclude")); + categorySelector.compute(); + + LOGGER.info("Writing rankings to " + args.getString("output-rankedcat") + "..."); + PrintWriter out = new PrintWriter(args.getString("output-rankedcat")); + for (int c : categorySelector.orderedCatIds) { + out.print(catNames.get(c)); + out.print("\t"); + out.print(Double.toString(categorySelector.catId2rank.get(c))); + out.println(); + } + out.close(); + + + Int2ObjectMap newPage2cat = categorySelector.recategorize(page2cat); + + LOGGER.info("Writing new page2cat map to " + args.getString("output-page2cat") + "..."); + out = new PrintWriter(args.getString("output-page2cat")); + ProgressLogger pl = new ProgressLogger(LOGGER, "pages"); + for (Int2ObjectMap.Entry p2c : newPage2cat.int2ObjectEntrySet()) { + out.print(pageNames.get(p2c.getIntKey())); + out.print("\t"); + for (String c : categorySelector.toSortedNames(p2c.getValue())) out.print(c + "\t"); + out.println(); + pl.lightUpdate(); + } + pl.done(); + out.close(); + + } + + + + +} diff --git a/java/src/it/unimi/di/wikipedia/categories/HittingDistanceMinimizer.java b/java/src/it/unimi/di/wikipedia/categories/HittingDistanceMinimizer.java new file mode 100644 
index 0000000..fa7defb --- /dev/null +++ b/java/src/it/unimi/di/wikipedia/categories/HittingDistanceMinimizer.java @@ -0,0 +1,150 @@ +package it.unimi.di.wikipedia.categories; + +import it.unimi.dsi.fastutil.ints.IntArrayFIFOQueue; +import it.unimi.dsi.fastutil.ints.IntArrayPriorityQueue; +import it.unimi.dsi.fastutil.ints.IntPriorityQueue; +import it.unimi.dsi.fastutil.ints.IntSet; +import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; +import it.unimi.dsi.fastutil.objects.ObjectSet; +import it.unimi.dsi.logging.ProgressLogger; +import it.unimi.dsi.webgraph.ImmutableGraph; +import it.unimi.dsi.webgraph.LazyIntIterator; + +import java.util.Arrays; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +public class HittingDistanceMinimizer { + public static final Logger LOGGER = LoggerFactory.getLogger(HittingDistanceMinimizer.class); + + final ImmutableGraph transposed; + final int[] minMilestoneDistance; + final int[] closestMilestone; + final IntSet milestones; + final ObjectSet runningVisitors; + final IntPriorityQueue milestoneQueue; + final ProgressLogger pl; + + public HittingDistanceMinimizer(ImmutableGraph transposedGraph, IntSet milestones) { + this.transposed = transposedGraph; + this.milestones = milestones; + minMilestoneDistance = new int[transposedGraph.numNodes()]; + Arrays.fill(minMilestoneDistance, Integer.MAX_VALUE); + closestMilestone = new int[transposedGraph.numNodes()]; + Arrays.fill(closestMilestone, -1); + milestoneQueue = new IntArrayPriorityQueue(milestones.toIntArray()); + runningVisitors = new ObjectOpenHashSet(); + pl = new ProgressLogger(LOGGER, "milestones"); + pl.expectedUpdates = milestones.size(); + + } + + private class Visitor extends Thread { + + final int start; + final int[] dists; + final ImmutableGraph graph; + + Visitor(final ImmutableGraph graph, int startingNode) { + this.start = startingNode; + dists = new int[ graph.numNodes() ]; + this.graph = graph.copy(); + } + + @Override + public void run() { + final 
IntArrayFIFOQueue queue = new IntArrayFIFOQueue(); + + Arrays.fill( dists, Integer.MAX_VALUE ); // Initially, all distances are infinity. + + int curr, succ; + queue.enqueue( start ); + dists[ start ] = 0; + + LazyIntIterator successors; + + while( ! queue.isEmpty() ) { + curr = queue.dequeueInt(); + successors = graph.successors( curr ); + int d = graph.outdegree( curr ); + while( d-- != 0 ) { + succ = successors.nextInt(); + if ( dists[ succ ] == Integer.MAX_VALUE ) { + dists[ succ ] = dists[ curr ] + 1; + queue.enqueue( succ ); + } + } + } + + startNewThreadAfter(this); + } + + @Override + public int hashCode() { return start; } + + @Override + public boolean equals(Object o) { + return (((o instanceof Visitor)) && ((Visitor) o).start == this.start); + } + } + + private synchronized void startNewThreadAfter(Visitor thread) { + if (thread != null) { + if (!runningVisitors.remove(thread)) { + throw new IllegalStateException( + "Thread " + thread + " signaled completion but was not present."); + } + updateClosestMilestonesAfter(thread.start, thread.dists); + pl.update(); + } + + if (!milestoneQueue.isEmpty()) { + int milestone = milestoneQueue.dequeueInt(); + Visitor visitor = new Visitor(transposed, milestone); + runningVisitors.add(visitor); + visitor.start(); + } else + if (runningVisitors.isEmpty()) { + synchronized (this) { + this.notifyAll(); + } + } + } + + + private void updateClosestMilestonesAfter(int milestone, int[] distances) { + final int numNodes = transposed.numNodes(); + for (int node = 0; node < numNodes; node++) { + if (distances[node] < minMilestoneDistance[node]) { + minMilestoneDistance[node] = distances[node]; + closestMilestone[node] = milestone; + } + } + } + + public int[] compute() { + return compute(Runtime.getRuntime().availableProcessors()); + } + + public int[] compute(int nOfThreads) { + pl.start("Starting a BFS for each milestone (with " + nOfThreads + " parallel threads)..."); + for (int i = 0; i < nOfThreads; i++) { + 
startNewThreadAfter(null); + } + try { + synchronized (this) { + while (!milestoneQueue.isEmpty()) + this.wait(); + } + } catch (InterruptedException e) { throw new RuntimeException(e); } + + pl.done(); + + return closestMilestone; + + } + + +} diff --git a/java/src/it/unimi/di/wikipedia/parsing/DocumentSequenceImmutableGraph.java b/java/src/it/unimi/di/wikipedia/parsing/DocumentSequenceImmutableGraph.java new file mode 100644 index 0000000..917091b --- /dev/null +++ b/java/src/it/unimi/di/wikipedia/parsing/DocumentSequenceImmutableGraph.java @@ -0,0 +1,137 @@ +package it.unimi.di.wikipedia.parsing; + +import it.unimi.di.big.mg4j.document.Document; +import it.unimi.di.big.mg4j.document.DocumentIterator; +import it.unimi.di.big.mg4j.document.DocumentSequence; +import it.unimi.di.big.mg4j.tool.Scan; +import it.unimi.di.big.mg4j.tool.Scan.VirtualDocumentFragment; +import it.unimi.di.big.mg4j.tool.VirtualDocumentResolver; +import it.unimi.dsi.fastutil.longs.LongAVLTreeSet; +import it.unimi.dsi.fastutil.longs.LongSortedSet; +import it.unimi.dsi.fastutil.io.BinIO; +import it.unimi.dsi.fastutil.objects.ObjectIterator; +import it.unimi.dsi.fastutil.objects.ObjectList; +import it.unimi.dsi.big.webgraph.ImmutableGraph; +import it.unimi.dsi.big.webgraph.ImmutableSequentialGraph; +import it.unimi.dsi.big.webgraph.NodeIterator; + +import java.io.IOException; +import java.util.NoSuchElementException; + +/** Exposes a document sequence as a (sequentially accessible) immutable graph, according to some +* virtual field provided by the documents in the sequence. A suitable {@link VirtualDocumentResolver} +* is used to associate node numbers to each fragment. +* +*

More precisely, the graph will have as many nodes as there are documents in the sequence, the +* k-th document (starting from 0) representing node number k. +* The successors of a document are obtained by extracting the virtual field from the +* document, turning each {@linkplain it.unimi.di.mg4j.tool.Scan.VirtualDocumentFragment document specifier} +* into a document number (using the given {@linkplain VirtualDocumentResolver resolver}, +* and discarding unresolved URLs). +*/ +public class DocumentSequenceImmutableGraph extends ImmutableSequentialGraph { + + /** The underlying sequence. */ + private DocumentSequence sequence; + /** The number of the virtual field to be used. */ + private int virtualField; + /** The resolver to be used. */ + private VirtualDocumentResolver resolver; + + /** Creates an immutable graph from a sequence. + * + * @param sequence the sequence whence the immutable graph should be created. + * @param virtualField the number of the virtual field to be used to get the successors from. + * @param resolver the resolver to be used to map document specs to node numbers. + */ + public DocumentSequenceImmutableGraph( final DocumentSequence sequence, final int virtualField, final VirtualDocumentResolver resolver ) { + this.sequence = sequence; + this.virtualField = virtualField; + this.resolver = resolver; + } + + /** Creates a new immutable graph with the specified arguments. + * + * @param arg a 3-element array: the first is the basename of a {@link DocumentSequence}, the second is an integer specifying the virtual + * field number, the third is the basename of a {@link VirtualDocumentResolver}. + */ + public DocumentSequenceImmutableGraph( final String... 
arg ) throws IOException, ClassNotFoundException { + this( (DocumentSequence)BinIO.loadObject( arg[ 0 ] ), Integer.parseInt( arg[ 1 ] ), (VirtualDocumentResolver)BinIO.loadObject( arg[ 2 ] ) ); + } + + @Override + public ImmutableGraph copy() { + throw new UnsupportedOperationException(); + } + + @Override + public long numNodes() { + if ( resolver.numberOfDocuments() > Integer.MAX_VALUE ) throw new IllegalArgumentException(); + return resolver.numberOfDocuments(); + } + + @Override + public boolean randomAccess() { + return false; + } + + public NodeIterator nodeIterator() { + try { + final DocumentIterator documentIterator = sequence.iterator(); + return new NodeIterator() { + Document cachedDocument = documentIterator.nextDocument(); + int cachedDocumentNumber = 0; + long[] cachedSuccessors; + LongSortedSet succ = new LongAVLTreeSet(); + + public boolean hasNext() { + return cachedDocument != null; + } + + @SuppressWarnings("unchecked") + public long nextLong() { + if ( !hasNext() ) throw new NoSuchElementException(); + ObjectList vdf; + try { + vdf = (ObjectList)cachedDocument.content( virtualField ); + } + catch ( IOException exc1 ) { + throw new RuntimeException( exc1 ); + } + succ.clear(); + resolver.context( cachedDocument ); + ObjectIterator it = vdf.iterator(); + while ( it.hasNext() ) { + long successor = resolver.resolve( it.next().documentSpecifier() ); + if ( successor >= 0 ) succ.add( successor ); + } + cachedSuccessors = succ.toLongArray(); + // Get ready for the next request + try { + cachedDocument.close(); + cachedDocument = documentIterator.nextDocument(); + } + catch ( IOException e ) { + throw new RuntimeException( e ); + } + return cachedDocumentNumber++; + } + + public long outdegree() { + return cachedSuccessors.length; + } + + + public long[][] successorBigArray() { + return new long[][] {cachedSuccessors}; + } + + }; + } + catch ( IOException e ) { + throw new RuntimeException( e ); + } + + } + +} diff --git 
a/java/src/it/unimi/di/wikipedia/parsing/NamespacedWikipediaDocumentSequence.java b/java/src/it/unimi/di/wikipedia/parsing/NamespacedWikipediaDocumentSequence.java new file mode 100644 index 0000000..90f5a1e --- /dev/null +++ b/java/src/it/unimi/di/wikipedia/parsing/NamespacedWikipediaDocumentSequence.java @@ -0,0 +1,860 @@ +package it.unimi.di.wikipedia.parsing; + + +/* + * Modified version of: + * + * MG4J: Managing Gigabytes for Java (big) + * + * Copyright (C) 2013 Sebastiano Vigna + * + * This library is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published by the Free + * Software Foundation; either version 3 of the License, or (at your option) + * any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License + * for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, see . 
+ * + */ + +import info.bliki.wiki.filter.Encoder; +import info.bliki.wiki.filter.HTMLConverter; +import info.bliki.wiki.filter.PlainTextConverter; +import info.bliki.wiki.model.WikiModel; +import it.unimi.dsi.big.util.ShiftAddXorSignedStringMap; +import it.unimi.dsi.big.util.StringMap; +import it.unimi.di.big.mg4j.document.AbstractDocument; +import it.unimi.di.big.mg4j.document.AbstractDocumentFactory; +import it.unimi.di.big.mg4j.document.AbstractDocumentIterator; +import it.unimi.di.big.mg4j.document.AbstractDocumentSequence; +import it.unimi.di.big.mg4j.document.CompositeDocumentFactory; +import it.unimi.di.big.mg4j.document.Document; +import it.unimi.di.big.mg4j.document.DocumentFactory; +import it.unimi.di.big.mg4j.document.DocumentIterator; +import it.unimi.di.big.mg4j.document.DocumentSequence; +import it.unimi.di.big.mg4j.document.HtmlDocumentFactory; +import it.unimi.di.big.mg4j.document.PropertyBasedDocumentFactory; +import it.unimi.di.big.mg4j.document.WikipediaDocumentCollection; +import it.unimi.di.big.mg4j.tool.URLMPHVirtualDocumentResolver; +import it.unimi.di.big.mg4j.tool.VirtualDocumentResolver; +import it.unimi.di.big.mg4j.util.parser.callback.AnchorExtractor; +import it.unimi.di.big.mg4j.util.parser.callback.AnchorExtractor.Anchor; +import it.unimi.dsi.bits.TransformationStrategies; +import it.unimi.dsi.bits.TransformationStrategy; +import it.unimi.dsi.fastutil.io.BinIO; +import it.unimi.dsi.fastutil.io.FastBufferedInputStream; +import it.unimi.dsi.fastutil.objects.AbstractObject2LongFunction; +import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; +import it.unimi.dsi.fastutil.objects.Object2LongFunction; +import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap; +import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap; +import it.unimi.dsi.fastutil.objects.ObjectArrayList; +import it.unimi.dsi.fastutil.objects.ObjectBigList; +import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; +import 
it.unimi.dsi.fastutil.objects.Reference2ObjectMap; +import it.unimi.dsi.fastutil.objects.Reference2ObjectOpenHashMap; +import it.unimi.dsi.io.FastBufferedReader; +import it.unimi.dsi.io.FileLinesCollection; +import it.unimi.dsi.io.WordReader; +import it.unimi.dsi.lang.MutableString; +import it.unimi.dsi.lang.ObjectParser; +import it.unimi.dsi.logging.ProgressLogger; +import it.unimi.dsi.sux4j.mph.MWHCFunction; +import it.unimi.dsi.util.TextPattern; +import it.unimi.dsi.webgraph.ImmutableGraph; + +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Serializable; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.Map; +import java.util.Map.Entry; +import java.util.concurrent.ArrayBlockingQueue; + +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; +import org.apache.commons.io.IOUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.Attributes; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import com.google.common.base.Charsets; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Iterables; +import com.google.common.html.HtmlEscapers; +import com.martiansoftware.jsap.FlaggedOption; +import com.martiansoftware.jsap.JSAP; +import com.martiansoftware.jsap.JSAPException; +import com.martiansoftware.jsap.JSAPResult; +import com.martiansoftware.jsap.Parameter; +import com.martiansoftware.jsap.SimpleJSAP; +import com.martiansoftware.jsap.Switch; +import com.martiansoftware.jsap.UnflaggedOption; + +/** A class exhibiting a standard Wikipedia XML dump as a {@link DocumentSequence}. + * + *

Warning: this class has no connection whatsoever with + * {@link WikipediaDocumentCollection}. + * + *

The purpose of this class is making the indexing of Wikipedia and of its entity + * graph starting from a pristine Wikipedia XML dump reasonably easy. There are a few + * steps involved, mainly due to the necessity of working out redirects, but the whole + * procedure can be carried out with very little resources. The class uses the + * {@link WikiModel#toHtml(String, Appendable, String, String)} method to convert + * the Wikipedia format into HTML, and then passes the result to a standard {@link HtmlDocumentFactory} + * (suggestion on alternative conversion methods are welcome). + * A few additional fields are handled by {@link WikipediaHeaderFactory}. + * + *

Note that no properties are passed to the underlying {@link HtmlDocumentFactory}: if you want + * to set the anchor properties (see {@link HtmlDocumentFactory.MetadataKeys}), you need to use + * {@linkplain #NamespacedWikipediaDocumentSequence(String, boolean, String, boolean, boolean, int, int, int) a quite humongous constructor}. + * + *

How to index Wikipedia

+ * + *

As a first step, download the Wikipedia XML dump (it's the “pages-articles” file; + * it should start with a mediawiki opening tag). This class can process the + * file in its compressed form, but we suggest to uncompress it using bunzip2, + * as processing is an order of magnitude faster. (Note that the following process will exclude namespaced + * pages such as Template:something; if you want to include them, you must + * use a different {@linkplain #NamespacedWikipediaDocumentSequence(String, boolean, String, boolean, boolean) constructor}.) + * + * + *

The first step is extracting metadata (in particular, the URLs that are necessary to + * index correctly the anchor text). We do not suggest specific Java options, but try to use + * as much memory as you can. + *

+ * java it.unimi.di.mg4j.tool.ScanMetadata \
+ *   -o "it.unimi.di.mg4j.document.WikipediaDocumentSequence(enwiki-latest-pages-articles.xml,false,http://en.wikipedia.org/wiki/,false)" \
+ *   -u enwiki.uris -t enwiki.titles
+ * 
+ * + *

Note that we used the {@link ObjectParser}-based constructor of this class, which makes it possible to create + * a {@link NamespacedWikipediaDocumentSequence} instance parsing a textual specification (see the + * {@linkplain #NamespacedWikipediaDocumentSequence(String, boolean, String, boolean) constructor} + * documentation for details about the parameters). + * + *

The second step consists in building a first {@link VirtualDocumentResolver} which, however, + * does not comprise redirect information: + *

+ * java it.unimi.di.mg4j.tool.URLMPHVirtualDocumentResolver -o enwiki.uris enwiki.vdr
+ * 
+ * + *

Now we need to use the ad hoc main method of this class to rescan the collection, gather the redirect + * information and merge it with our current resolver: + *

+ * java it.unimi.di.mg4j.document.WikipediaDocumentSequence \
+ *   enwiki-latest-pages-articles.xml http://en.wikipedia.org/wiki/ enwiki.uris enwiki.vdr enwikired.vdr
+ * 
+ * + *

During this phase a quite large number of warnings about failed redirects might appear. This is normal, + * in particular if you do not index template pages. If you suspect an actual bug, try first to index template pages, + * too. Failed redirects should be in the order of few thousands, and all due to internal inconsistencies of + * the dump: to check that this is the case, check whether the target of a failed redirect appears as a page + * title (it shouldn't). + * + *

We have now all information required to build a complete index (we use the Porter2 stemmer in this example): + *

+ * java it.unimi.di.mg4j.tool.IndexBuilder \
+ *   -o "it.unimi.di.mg4j.document.WikipediaDocumentSequence(enwiki.xml,false,http://en.wikipedia.org/wiki/,true)" \ 
+ *   --all-fields -v enwiki.vdr -t EnglishStemmer enwiki
+ * 
+ * + *

Finally, we can build the entity graph using a bridge class that exposes any {@link DocumentSequence} with a virtual + * field as an {@link ImmutableGraph} of the WebGraph framework (the nodes will be in one-to-one correspondence with the documents + * returned by the index): + *

+ * java it.unimi.dsi.big.webgraph.BVGraph \
+ *   -s "it.unimi.di.mg4j.util.DocumentSequenceImmutableSequentialGraph(\"it.unimi.di.mg4j.document.WikipediaDocumentSequence(enwiki.xml,false,http://en.wikipedia.org/wiki/,true)\",anchor,enwikired.vdr)" \ 
+ *   enwiki
+ * 
+ * + *

Additional fields

+ * + *

The additional fields generated by this class (some of which are a bit hacky) are: + * + *

+ *
title + *
the title of the Wikipedia page; + *
id + *
a payload index containing the Wikipedia identifier of the page; + *
lastedit + *
a payload index containing the last edit of the page; + *
category + *
a field containing the categories of the page, separated by an artificial marker OXOXO (so when you look for a category as a phrase you + * don't get false cross-category positives); + *
firstpar + *
a heuristically generated first paragraph of the page, useful for identification beyond the title; + *
redirects + *
a virtual field treating the link of the page with its title and any redirect link to the page as an anchor: in practice, the + * field contains all names under which the page is known in Wikipedia. + *
+ * + *

Note that for each link in a disambiguation page this class will generate a fake link with the same target, but + * the title of the disambiguation page as text. This is in the same spirit of the redirects field—we enrich + * the HTML anchor field with useful information without altering the generated graph. + */ + +public class NamespacedWikipediaDocumentSequence extends AbstractDocumentSequence implements Serializable { + private static final Logger LOGGER = LoggerFactory.getLogger( NamespacedWikipediaDocumentSequence.class ); + private static final long serialVersionUID = 1L; + + private static final TextPattern CATEGORY_START = new TextPattern( "[[Category:" ); + private static final TextPattern BRACKETS_CLOSED = new TextPattern( "]]" ); + private static final TextPattern BRACES_CLOSED = new TextPattern( "}}" ); + private static final TextPattern DISAMBIGUATION = new TextPattern( "{{disambiguation" ); + private static final TextPattern BRACKETS_OPEN = new TextPattern( "[[" ); + private static final char[] END_OF_DISAMBIGUATION_LINK = new char[] { '|', ']' }; + + /** A marker used to denote end of input. */ + private static final DocumentAndFactory END = new DocumentAndFactory( null, null ); + /** The prototype {@link CompositeDocumentFactory} used to parse Wikipedia pages. */ + private final DocumentFactory factory; + /** Whether the input is compressed with bzip2. */ + private final boolean bzipped; + /** Whether to parse text (e.g., we do not parse text when computing titles/URIs). */ + private final boolean parseText; + /** Whether to keep in the index namespace pages. */ + private final boolean keepNamespaced; + /** The Wikipedia XML dump. */ + private final String wikipediaXmlDump; + /** The base URL for pages (e.g., http://en.wikipedi.org/wiki/). */ + private final String baseURL; + /** {@link #baseURL} concatenated with ${title}. */ + private final String linkBaseURL; + /** {@link #baseURL} concatenated with ${image}. 
*/ + private final String imageBaseURL; + /** The set of namespaces specified in {@link #wikipediaXmlDump}. */ + private ImmutableSet nameSpaces; + /** This list (whose access must be synchronized) accumulates virtual text (anchors) generated by redirects. + * It is filled when meeting redirect pages, and it is emptied at the first non-redirect page (the page in which the list + * is emptied is immaterial). Note that because of this setup, if there are some redirect + * pages that are not followed by any indexed page the anchors of those redirects won't be processed at all. + * If this is a problem, just add a fake empty page at the end. */ + private final ObjectArrayList redirectAnchors = new ObjectArrayList(); + + public static enum MetadataKeys { + ID, + LASTEDIT, + CATEGORY, + FIRSTPAR, + /** This key is used internally by {@link WikipediaHeaderFactory} and is associated with the list of redirect anchors. */ + REDIRECT + }; + + /** A factory responsible for special Wikipedia fields (see the {@linkplain NamespacedWikipediaDocumentSequence class documentation}). It + * will be {@linkplain CompositeDocumentFactory composed} with an {@link HtmlDocumentFactory}. 
*/ + public static final class WikipediaHeaderFactory extends AbstractDocumentFactory { + private static final long serialVersionUID = 1L; + private static final Object2IntOpenHashMap FIELD_2_INDEX = new Object2IntOpenHashMap( new String[] { "title", "id", "lastedit", "category", "firstpar", "redirect" }, new int[] { 0, 1, 2, 3, 4, 5 } ); + static { + FIELD_2_INDEX.defaultReturnValue( -1 ); + } + + private final WordReader wordReader = new FastBufferedReader(); + + @Override + public int numberOfFields() { + return 6; + } + + @Override + public String fieldName( int field ) { + switch( field ) { + case 0: return "title"; + case 1: return "id"; + case 2: return "lastedit"; + case 3: return "category"; + case 4: return "firstpar"; + case 5: return "redirect"; + default: throw new IllegalArgumentException(); + } + } + + @Override + public int fieldIndex( String fieldName ) { + return FIELD_2_INDEX.getInt( fieldName ); + } + + @Override + public FieldType fieldType( int field ) { + switch( field ) { + case 0: return FieldType.TEXT; + case 1: return FieldType.INT; + case 2: return FieldType.DATE; + case 3: return FieldType.TEXT; + case 4: return FieldType.TEXT; + case 5: return FieldType.VIRTUAL; + default: throw new IllegalArgumentException(); + } + } + + @Override + public Document getDocument( final InputStream unusedRawContent, final Reference2ObjectMap, Object> metadata ) throws IOException { + return new AbstractDocument() { + + @Override + public WordReader wordReader( int field ) { + return wordReader; // Fixed, for the time being. 
+ } + + @Override + public CharSequence uri() { + return (CharSequence)metadata.get( PropertyBasedDocumentFactory.MetadataKeys.URI ); + } + + @Override + public CharSequence title() { + return (CharSequence)metadata.get( PropertyBasedDocumentFactory.MetadataKeys.TITLE ); + } + + @Override + public Object content( final int field ) throws IOException { + switch( field ) { + case 0: return new FastBufferedReader( (MutableString)metadata.get( PropertyBasedDocumentFactory.MetadataKeys.TITLE ) ); + case 1: return metadata.get( MetadataKeys.ID ); + case 2: return metadata.get( MetadataKeys.LASTEDIT ); + case 3: return new FastBufferedReader( (MutableString)metadata.get( MetadataKeys.CATEGORY ) ); + case 4: return new FastBufferedReader( (MutableString)metadata.get( MetadataKeys.FIRSTPAR ) ); + case 5: + @SuppressWarnings("unchecked") + final ObjectArrayList redirectAnchors = (ObjectArrayList)metadata.get( MetadataKeys.REDIRECT ); + ImmutableList result; + + synchronized( redirectAnchors ) { + redirectAnchors.add( new Anchor( (MutableString)metadata.get( PropertyBasedDocumentFactory.MetadataKeys.URI ), (MutableString)metadata.get( PropertyBasedDocumentFactory.MetadataKeys.TITLE ) ) ); + result = ImmutableList.copyOf( redirectAnchors ); + redirectAnchors.clear(); + } + // System.err.println( "Adding " + result ); + return result; + default: throw new IllegalArgumentException(); + } + } + }; + } + + @Override + public DocumentFactory copy() { + return new WikipediaHeaderFactory(); + } + + } + + /** Builds a new Wikipedia document sequence that discards namespaced pages. + * + * @param file the file containing the Wikipedia dump. + * @param bzipped whether {@code file} is compressed with bzip2. + * @param baseURL a base URL for links (e.g., for the English Wikipedia, http://en.wikipedia.org/wiki/); + * note that if it is nonempty this string must terminate with a slash. 
+ * @param parseText whether to parse the text (this parameter is only set to false during metadata-scanning + * phases to speed up the scanning process). + */ + public NamespacedWikipediaDocumentSequence( final String file, final boolean bzipped, final String baseURL, final boolean parseText) { + this( file, bzipped, baseURL, parseText, false ); + } + + /** Builds a new Wikipedia document sequence using default anchor settings. + * + * @param file the file containing the Wikipedia dump. + * @param bzipped whether {@code file} is compressed with bzip2. + * @param baseURL a base URL for links (e.g., for the English Wikipedia, http://en.wikipedia.org/wiki/); + * note that if it is nonempty this string must terminate with a slash. + * @param parseText whether to parse the text (this parameter is only set to false during metadata-scanning + * phases to speed up the scanning process). + * @param keepNamespaced whether to keep namespaced pages (e.g., Template:something pages). + */ + public NamespacedWikipediaDocumentSequence( final String file, final boolean bzipped, final String baseURL, final boolean parseText, final boolean keepNamespaced) { + this( file, bzipped, baseURL, parseText, keepNamespaced, 8, 8, 8); + } + + /** Builds a new Wikipedia document sequence. + * + * @param file the file containing the Wikipedia dump. + * @param bzipped whether {@code file} is compressed with bzip2. + * @param baseURL a base URL for links (e.g., for the English Wikipedia, http://en.wikipedia.org/wiki/); + * note that if it is nonempty this string must terminate with a slash. + * @param parseText whether to parse the text (this parameter is only set to false during metadata-scanning + * phases to speed up the scanning process). + * @param keepNamespaced whether to keep namespaced pages (e.g., Template:something pages). + * @param maxPreAnchor maximum number of character before an anchor. + * @param maxAnchor maximum number of character in an anchor. 
+ * @param maxPostAnchor maximum number of characters after an anchor. + */ + public NamespacedWikipediaDocumentSequence( final String file, final boolean bzipped, final String baseURL, final boolean parseText, final boolean keepNamespaced, final int maxPreAnchor, final int maxAnchor, final int maxPostAnchor) { + this.wikipediaXmlDump = file; + this.bzipped = bzipped; + this.baseURL = baseURL; + this.parseText = parseText; + this.keepNamespaced = keepNamespaced; + Reference2ObjectOpenHashMap, Object> metadata = new Reference2ObjectOpenHashMap, Object>( + new Enum[] { HtmlDocumentFactory.MetadataKeys.MAXPREANCHOR, HtmlDocumentFactory.MetadataKeys.MAXANCHOR, HtmlDocumentFactory.MetadataKeys.MAXPOSTANCHOR }, + new Integer[] { Integer.valueOf( maxPreAnchor ), Integer.valueOf( maxAnchor ), Integer.valueOf( maxPostAnchor ) } + ); + DocumentFactory htmlDocumentFactory = new HtmlDocumentFactory(metadata); + this.factory = CompositeDocumentFactory.getFactory( new DocumentFactory[] { new WikipediaHeaderFactory(), htmlDocumentFactory }, new String[] { "title", "id", "lastedit", "category", "firstpar", "redirect", "text", "dummy", "anchor" } ); + linkBaseURL = baseURL + "${title}"; + imageBaseURL = baseURL + "${image}"; + } + + /** A string-based constructor to be used with an {@link ObjectParser}. + * + * @see #NamespacedWikipediaDocumentSequence(String, boolean, String, boolean) + */ + public NamespacedWikipediaDocumentSequence( final String file, final String bzipped, final String baseURL, final String parseText ) { + this( file, Boolean.parseBoolean( bzipped ), baseURL, Boolean.parseBoolean( parseText ) ); + } + + /** A string-based constructor to be used with an {@link ObjectParser}. 
+ * + * @see #NamespacedWikipediaDocumentSequence(String, boolean, String, boolean, boolean) + */ + public NamespacedWikipediaDocumentSequence( final String file, final String bzipped, final String baseURL, final String parseText, final String keepNamespaced ) { + this( file, Boolean.parseBoolean( bzipped ), baseURL, Boolean.parseBoolean( parseText ), Boolean.parseBoolean( keepNamespaced ) ); + } + + /** A string-based constructor to be used with an {@link ObjectParser}. + * + * @see #NamespacedWikipediaDocumentSequence(String, boolean, String, boolean, boolean, int, int, int) + */ + public NamespacedWikipediaDocumentSequence( final String file, final String bzipped, final String baseURL, final String parseText, final String keepNamespaced, final String maxBeforeAnchor, final String maxAnchor, final String maxPostAnchor ) { + this( file, Boolean.parseBoolean( bzipped ), baseURL, Boolean.parseBoolean( parseText ), Boolean.parseBoolean( keepNamespaced ), Integer.parseInt( maxBeforeAnchor ), Integer.parseInt( maxAnchor ), Integer.parseInt( maxPostAnchor ) ); + } + + private static final class DocumentAndFactory { + public final Document document; + public final DocumentFactory factory; + + public DocumentAndFactory( final Document document, final DocumentFactory documentFactory ) { + this.document = document; + this.factory = documentFactory; + } + } + + public boolean isATrueNamespace(final String stringBeforeColumn) { + return nameSpaces.contains( stringBeforeColumn.toLowerCase() ); + } + + public boolean isATrueNamespace(final MutableString stringBeforeColumn) { + return nameSpaces.contains( stringBeforeColumn.toLowerCase() ); + } + + @Override + public DocumentIterator iterator() throws IOException { + final SAXParserFactory saxParserFactory = SAXParserFactory.newInstance(); + saxParserFactory.setNamespaceAware( true ); + final MutableString nameSpaceAccumulator = new MutableString(); + final ObjectOpenHashSet nameSpacesAccumulator = new ObjectOpenHashSet(); + 
final ArrayBlockingQueue freeFactories = new ArrayBlockingQueue( 16 ); + for( int i = freeFactories.remainingCapacity(); i-- != 0; ) freeFactories.add( this.factory.copy() ); + final ArrayBlockingQueue readyDocumentsAndFactories = new ArrayBlockingQueue( freeFactories.size() ); + + final SAXParser parser; + try { + parser = saxParserFactory.newSAXParser(); + } + catch ( Exception e ) { + throw new RuntimeException( e.getMessage(), e ); + } + final DefaultHandler handler = new DefaultHandler() { + private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'"); + private boolean inText; + private boolean inTitle; + private boolean inId; + private boolean inTimestamp; + private boolean inNamespaceDef; + private boolean redirect; + private MutableString text = new MutableString(); + private MutableString title = new MutableString(); + private MutableString id = new MutableString(); + private MutableString timestamp = new MutableString(); + private final Reference2ObjectMap, Object> metadata = new Reference2ObjectOpenHashMap, Object>(); + { + metadata.put( PropertyBasedDocumentFactory.MetadataKeys.ENCODING, "UTF-8" ); + metadata.put( MetadataKeys.REDIRECT, redirectAnchors ); + } + + @Override + public void startElement( String uri, String localName, String qName, Attributes attributes ) throws SAXException { + if ( "page".equals( localName ) ) { + redirect = inText = inTitle = inId = inTimestamp = false; + text.length( 0 ); + title.length( 0 ); + id.length( 0 ); + timestamp.length( 0 ); + } + else if ( "text".equals( localName ) ) inText = true; + else if ( "title".equals( localName ) && title.length() == 0 ) inTitle = true; // We catch only the first id/title elements. 
+ else if ( "id".equals( localName ) && id.length() ==0 ) inId = true; + else if ( "timestamp".equals( localName ) && timestamp.length() ==0 ) inTimestamp = true; + else if ( "redirect".equals( localName ) ) { + redirect = true; + if ( attributes.getValue( "title" ) != null ) + // Accumulate the title of the page as virtual text of the redirect page. + synchronized ( redirectAnchors ) { + final String link = Encoder.encodeTitleToUrl( attributes.getValue( "title" ), true ); + redirectAnchors.add( new AnchorExtractor.Anchor( new MutableString( baseURL.length() + link.length() ).append( baseURL ).append( link ), title.copy() ) ); + } + } + else if ( "namespace".equals( localName ) ) { + // Found a new namespace + inNamespaceDef = true; + nameSpaceAccumulator.length( 0 ); + } + } + + @Override + public void endElement( String uri, String localName, String qName ) throws SAXException { + if ( "namespace".equals( localName ) ) { // Collecting a namespace + if ( nameSpaceAccumulator.length() != 0 ) nameSpacesAccumulator.add( nameSpaceAccumulator.copy().toLowerCase() ); + return; + } + + if ( "namespaces".equals( localName ) ) { // All namespaces collected + nameSpaces = ImmutableSet.copyOf( nameSpacesAccumulator ); + return; + } + + if ( ! 
redirect ) { + if ( "title".equals( localName ) ) { + // Set basic metadata for the page + metadata.put( PropertyBasedDocumentFactory.MetadataKeys.TITLE, title.copy() ); + String link = Encoder.encodeTitleToUrl( title.toString(), true ); + metadata.put( PropertyBasedDocumentFactory.MetadataKeys.URI, new MutableString( baseURL.length() + link.length() ).append( baseURL ).append( link ) ); + inTitle = false; + } + else if ( "id".equals( localName ) ) { + metadata.put( MetadataKeys.ID, Long.valueOf( id.toString() ) ); + inId = false; + } + else if ( "timestamp".equals( localName ) ) { + try { + metadata.put( MetadataKeys.LASTEDIT, dateFormat.parse( timestamp.toString() ) ); + } + catch ( ParseException e ) { + throw new RuntimeException( e.getMessage(), e ); + } + inTimestamp = false; + } + else if ( "text".equals( localName ) ) { + inText = false; + if ( ! keepNamespaced ) { + // Namespaces are case-insensitive and language-dependent + final int pos = title.indexOf( ':' ); + if ( pos != -1 && isATrueNamespace(title.substring( 0, pos )) ) return; + } + try { + final MutableString html = new MutableString(); + DocumentFactory freeFactory; + try { + freeFactory = freeFactories.take(); + } + catch ( InterruptedException e ) { + throw new RuntimeException( e.getMessage(), e ); + } + if ( parseText ) { + if ( DISAMBIGUATION.search( text ) != -1 ) { // It's a disambiguation page. + /* Roi's hack: duplicate links using the page title, so the generic name will end up as anchor text. */ + final MutableString newLinks = new MutableString(); + for( int start = 0, end; ( start = BRACKETS_OPEN.search( text, start ) ) != -1; start = end ) { + end = start; + final int endOfLink = text.indexOfAnyOf( END_OF_DISAMBIGUATION_LINK, start ); + // Note that we don't escape title because we are working at the Wikipedia raw text level. 
+ if ( endOfLink != -1 ) { + newLinks.append( text.array(), start, endOfLink - start ).append( '|' ).append( title ).append( "]]\n" ); + end = endOfLink; + } + end++; + } + + text.append( newLinks ); + } + // We separate categories by OXOXO, so we don't get overflowing phrases. + final MutableString category = new MutableString(); + for( int start = 0, end; ( start = CATEGORY_START.search( text, start ) ) != -1; start = end ) { + end = BRACKETS_CLOSED.search( text, start += CATEGORY_START.length() ); + if ( end != -1 ) category.append( text.subSequence( start, end ) ).append( " OXOXO " ); + else break; + } + metadata.put( MetadataKeys.CATEGORY, category ); + + // Heuristics to get the first paragraph + metadata.put( MetadataKeys.FIRSTPAR, new MutableString() ); + String plainText = new WikiModel( imageBaseURL, linkBaseURL ).render( new PlainTextConverter( true ), text.toString() ); + for( int start = 0; start < plainText.length(); start++ ) { + //System.err.println("Examining " + plainText.charAt( start ) ); + if ( Character.isWhitespace( plainText.charAt( start ) ) ) continue; + if ( plainText.charAt( start ) == '{' ) { + //System.err.print( "Braces " + start + " text: \"" + plainText.subSequence( start, start + 10 ) + "\" -> " ); + start = BRACES_CLOSED.search( plainText, start ); + //System.err.println( start + " text: \"" + plainText.subSequence( start, start + 10 ) + "\"" ); + if ( start == -1 ) break; + start++; + } + else if ( plainText.charAt( start ) == '[' ) { + start = BRACKETS_CLOSED.search( plainText, start ); + if ( start == -1 ) break; + start++; + } + else { + final int end = plainText.indexOf( '\n', start ); + if ( end != -1 ) metadata.put( MetadataKeys.FIRSTPAR, new MutableString( plainText.substring( start, end ) ) );//new MutableString( new WikiModel( imageBaseURL, linkBaseURL ).render( new PlainTextConverter( true ), text.substring( start, end ).toString() ) ) ); + break; + } + } + + try { + WikiModel wikiModel = new WikiModel( imageBaseURL, 
linkBaseURL ); + wikiModel.render( new HTMLConverter(), text.toString(), html, false, false ); + final Map categories = wikiModel.getCategories(); + // Put back category links in the page (they have been parsed by bliki and to not appear anymore in the HTML rendering) + for( Entry entry: categories.entrySet() ) { + final String key = entry.getKey(); + final String value = entry.getValue().trim(); + if ( value.length() != 0 ) // There are empty such things + html.append( "\n" ).append( HtmlEscapers.htmlEscaper().escape( key ) ).append( "\n" ); + } + } + catch( Exception e ) { + LOGGER.error( "Unexpected exception while parsing " + title, e ); + } + } + readyDocumentsAndFactories.put( new DocumentAndFactory( freeFactory.getDocument( IOUtils.toInputStream( html, Charsets.UTF_8 ), new Reference2ObjectOpenHashMap, Object>( metadata ) ), freeFactory ) ); + } + catch ( InterruptedException e ) { + throw new RuntimeException( e.getMessage(), e ); + } + catch ( IOException e ) { + throw new RuntimeException( e.getMessage(), e ); + } + } + } + } + + @Override + public void characters( char[] ch, int start, int length ) throws SAXException { + if ( inText && parseText ) text.append( ch, start, length ); + if ( inTitle ) title.append( ch, start, length ); + if ( inId ) id.append( ch, start, length ); + if ( inTimestamp ) timestamp.append( ch, start, length ); + if ( inNamespaceDef ) { + nameSpaceAccumulator.append( ch, start, length ); + inNamespaceDef = false; // Dirty, but it works + } + } + + @Override + public void ignorableWhitespace( char[] ch, int start, int length ) throws SAXException { + if ( inText && parseText ) text.append( ch, start, length ); + if ( inTitle ) title.append( ch, start, length ); + } + }; + + final Thread parsingThread = new Thread() { + public void run() { + try { + InputStream in = new FileInputStream( wikipediaXmlDump ); + if ( bzipped ) in = new BZip2CompressorInputStream( in ); + parser.parse( new InputSource( new InputStreamReader( new 
FastBufferedInputStream( in ), Charsets.UTF_8 ) ), handler ); + readyDocumentsAndFactories.put( END ); + } + catch ( Exception e ) { + throw new RuntimeException( e.getMessage(), e ); + } + } + }; + + parsingThread.start(); + + return new AbstractDocumentIterator() { + private DocumentFactory lastFactory; + @Override + public Document nextDocument() throws IOException { + try { + final DocumentAndFactory documentAndFactory = readyDocumentsAndFactories.take(); + if ( lastFactory != null ) freeFactories.put( lastFactory ); + if ( documentAndFactory == END ) return null; + lastFactory = documentAndFactory.factory; + return documentAndFactory.document; + } + catch ( InterruptedException e ) { + throw new RuntimeException( e.getMessage(), e ); + } + } + }; + } + + @Override + public DocumentFactory factory() { + return factory; + } + + /** A wrapper around a signed function that remaps entries exceeding a provided threshold using a specified target array. */ + public static final class SignedRedirectedStringMap extends AbstractObject2LongFunction implements StringMap { + private static final long serialVersionUID = 1L; + /** The number of documents. */ + private final long numberOfDocuments; + /** A signed function function mapping valid keys to their ordinal position. */ + private Object2LongFunction signedFunction; + /** The value to be returned for keys whose ordinal position is greater than {@link #numberOfDocuments}. */ + private final long[] target; + + /** Creates a new signed redirected map. + * + * @param numberOfDocuments the threshold after which the {@code target} array will be used to compute the output. + * @param signedFunction the base signed function. + * @param target an array providing the output for items beyond {@code numberOfDocuments}; it must be + * long as the size of {@code signedFunction} minus {@code numberOfDocuments}. 
+ */ + public SignedRedirectedStringMap( final long numberOfDocuments, final Object2LongFunction signedFunction, final long[] target ) { + this.numberOfDocuments = numberOfDocuments; + this.signedFunction = signedFunction; + this.target = target; + } + + @Override + public long getLong( Object key ) { + final long index = signedFunction.getLong( key ); + if ( index == -1 ) return -1; + if ( index < numberOfDocuments ) return index; + return target[ (int)( index - numberOfDocuments ) ]; + } + + @Override + public boolean containsKey( Object key ) { + return signedFunction.getLong( key ) != -1; + } + + public long size64() { + return numberOfDocuments; + } + + @Override + @Deprecated + public int size() { + return (int)Math.min( Integer.MAX_VALUE, size64() ); + } + + @Override + public ObjectBigList list() { + return null; + } + } + + + public static void main( final String arg[] ) throws ParserConfigurationException, SAXException, IOException, JSAPException, ClassNotFoundException { + SimpleJSAP jsap = new SimpleJSAP( NamespacedWikipediaDocumentSequence.class.getName(), "Computes the redirects of a Wikipedia dump and integrate them into an existing virtual document resolver for the dump.", + new Parameter[] { + new Switch( "bzip2", 'b', "bzip2", "The file is compressed with bzip2" ), + new Switch( "iso", 'i', "iso", "Use ISO-8859-1 coding internally (i.e., just use the lower eight bits of each character)." ), + new FlaggedOption( "width", JSAP.INTEGER_PARSER, Integer.toString( Long.SIZE ), JSAP.NOT_REQUIRED, 'w', "width", "The width, in bits, of the signatures used to sign the function from URIs to their rank." ), + new UnflaggedOption( "file", JSAP.STRING_PARSER, JSAP.REQUIRED, "The file containing the Wikipedia dump." ), + new UnflaggedOption( "baseURL", JSAP.STRING_PARSER, JSAP.REQUIRED, "The base URL for the collection (e.g., http://en.wikipedia.org/wiki/)." 
), + new UnflaggedOption( "uris", JSAP.STRING_PARSER, JSAP.REQUIRED, "The URIs of the documents in the collection (generated by ScanMetadata)." ), + new UnflaggedOption( "vdr", JSAP.STRING_PARSER, JSAP.REQUIRED, "The name of a precomputed virtual document resolver for the collection." ), + new UnflaggedOption( "redvdr", JSAP.STRING_PARSER, JSAP.REQUIRED, "The name of the resulting virtual document resolver." ) + }); + + JSAPResult jsapResult = jsap.parse( arg ); + if ( jsap.messagePrinted() ) return; + + final SAXParserFactory saxParserFactory = SAXParserFactory.newInstance(); + saxParserFactory.setNamespaceAware( true ); + final Object2ObjectOpenHashMap redirects = new Object2ObjectOpenHashMap(); + final String baseURL = jsapResult.getString( "baseURL" ); + final ProgressLogger progressLogger = new ProgressLogger( LOGGER ); + progressLogger.itemsName = "redirects"; + progressLogger.start( "Extracting redirects..." ); + + final SAXParser parser = saxParserFactory.newSAXParser(); + final DefaultHandler handler = new DefaultHandler() { + private boolean inTitle; + private MutableString title = new MutableString(); + + @Override + public void startElement( String uri, String localName, String qName, Attributes attributes ) throws SAXException { + if ( "page".equals( localName ) ) { + inTitle = false; + title.length( 0 ); + } + else if ( "title".equals( localName ) && title.length() == 0 ) inTitle = true; // We catch only the first title element. 
+ else if ( "redirect".equals( localName ) && attributes.getValue( "title" ) != null ) { + progressLogger.update(); + redirects.put( title.copy(), attributes.getValue( "title" ) ); + } + } + + @Override + public void endElement( String uri, String localName, String qName ) throws SAXException { + if ( "title".equals( localName ) ) inTitle = false; + } + + @Override + public void characters( char[] ch, int start, int length ) throws SAXException { + if ( inTitle ) title.append( ch, start, length ); + } + + @Override + public void ignorableWhitespace( char[] ch, int start, int length ) throws SAXException { + if ( inTitle ) title.append( ch, start, length ); + } + }; + + InputStream in = new FileInputStream( jsapResult.getString( "file" ) ); + if ( jsapResult.userSpecified( "bzip2" ) ) in = new BZip2CompressorInputStream( in ); + parser.parse( new InputSource( new InputStreamReader( new FastBufferedInputStream( in ), Charsets.UTF_8 ) ), handler ); + progressLogger.done(); + + final Object2LongLinkedOpenHashMap resolved = new Object2LongLinkedOpenHashMap(); + final VirtualDocumentResolver vdr = (VirtualDocumentResolver)BinIO.loadObject( jsapResult.getString( "vdr" ) ); + + progressLogger.expectedUpdates = redirects.size(); + progressLogger.start( "Examining redirects..." 
); + + for( Map.Entry e: redirects.entrySet() ) { + final MutableString start = new MutableString().append( baseURL ).append( Encoder.encodeTitleToUrl( e.getKey().toString(), true ) ); + final MutableString end = new MutableString().append( baseURL ).append( Encoder.encodeTitleToUrl( e.getValue(), true ) ); + final long s = vdr.resolve( start ); + if ( s == -1 ) { + final long t = vdr.resolve( end ); + if ( t != -1 ) resolved.put( start.copy(), t ); + else LOGGER.warn( "Failed redirect: " + start + " -> " + end ); + } + else LOGGER.warn( "URL " + start + " is already known to the virtual document resolver" ); + + progressLogger.lightUpdate(); + } + + progressLogger.done(); + + //System.err.println(resolved); + + final Iterable allURIs = Iterables.concat( new FileLinesCollection( jsapResult.getString( "uris" ), "UTF-8" ), resolved.keySet() ); + final long numberOfDocuments = vdr.numberOfDocuments(); + + final TransformationStrategy transformationStrategy = jsapResult.userSpecified( "iso" ) + ? 
TransformationStrategies.iso() + : TransformationStrategies.utf16(); + + BinIO.storeObject( + new URLMPHVirtualDocumentResolver( + new SignedRedirectedStringMap( numberOfDocuments, + new ShiftAddXorSignedStringMap( allURIs.iterator(), new MWHCFunction.Builder().keys( allURIs ).transform( transformationStrategy ).build(), jsapResult.getInt( "width" ) ), + resolved.values().toLongArray() ) ), jsapResult.getString( "redvdr" ) ); + } +} diff --git a/java/src/it/unimi/di/wikipedia/parsing/WikipediaCategoryProducer.java b/java/src/it/unimi/di/wikipedia/parsing/WikipediaCategoryProducer.java new file mode 100644 index 0000000..2a84cfa --- /dev/null +++ b/java/src/it/unimi/di/wikipedia/parsing/WikipediaCategoryProducer.java @@ -0,0 +1,232 @@ +package it.unimi.di.wikipedia.parsing; + +import it.unimi.di.big.mg4j.document.Document; +import it.unimi.di.big.mg4j.document.DocumentIterator; +import it.unimi.di.wikipedia.utils.IntMapGraph; +import it.unimi.di.wikipedia.utils.MapUtils; +import it.unimi.dsi.fastutil.ints.Int2ObjectMap; +import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap; +import it.unimi.dsi.fastutil.ints.IntOpenHashSet; +import it.unimi.dsi.fastutil.ints.IntSet; +import it.unimi.dsi.fastutil.io.BinIO; +import it.unimi.dsi.fastutil.objects.Object2IntMap; +import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; +import it.unimi.dsi.logging.ProgressLogger; +import it.unimi.dsi.webgraph.BVGraph; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.PrintStream; +import java.io.Reader; +import java.util.Scanner; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.io.output.NullOutputStream; +import org.apache.commons.lang.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.martiansoftware.jsap.JSAP; +import com.martiansoftware.jsap.JSAPResult; +import com.martiansoftware.jsap.Parameter; +import com.martiansoftware.jsap.SimpleJSAP; +import 
com.martiansoftware.jsap.Switch; +import com.martiansoftware.jsap.UnflaggedOption; + +public class WikipediaCategoryProducer { + + public static Logger LOGGER = LoggerFactory.getLogger(WikipediaCategoryProducer.class); + + private static final int CATEGORY_NAME_INDEX = "Category:".length(); + private static final int UNSEEN_CATEGORY = -1; + private static final int CATEGORY_FIELD = 3; + private static final String SEPARATOR_REGEX = "OXOXO"; + private static final int ESITMATED_NUM_OF_PAGES = 6721260; // this is needed only to log progress + + + + + private final NamespacedWikipediaDocumentSequence wikipediaDocumentSequence; + private final Int2ObjectMap pageId2Name; + private final Object2IntMap catName2Id; + private final Int2ObjectMap page2cat; + private final Int2ObjectOpenHashMap cat2cat; + private PrintStream plainUrisFile = new PrintStream(NullOutputStream.NULL_OUTPUT_STREAM); + + private int nextPageId; + private int nextCategoryId; + + WikipediaCategoryProducer(NamespacedWikipediaDocumentSequence wds) { + this.wikipediaDocumentSequence = wds; + catName2Id = new Object2IntOpenHashMap(); + catName2Id.defaultReturnValue(UNSEEN_CATEGORY); + pageId2Name = new Int2ObjectOpenHashMap(); + page2cat = new Int2ObjectOpenHashMap(); + cat2cat = new Int2ObjectOpenHashMap(); + nextPageId = 0; + nextCategoryId = 0; + } + + + private static enum Namespace { + ARTICLE, OTHER, CATEGORY; + } + + private Namespace getNamespace(String title) { + int pos = title.indexOf(':'); + if (pos < 0) return Namespace.ARTICLE; + String namespace = title.substring(0, pos); + return (namespace.toLowerCase().equals("category")) ? + Namespace.CATEGORY + : ( + (wikipediaDocumentSequence.isATrueNamespace(namespace)) ? 
+ Namespace.OTHER + : Namespace.ARTICLE + ); + } + + private int getCategoryId(String category) { + int categoryId = catName2Id.getInt(category); + if (categoryId == UNSEEN_CATEGORY) { + categoryId = this.nextCategoryId++; + catName2Id.put(category, categoryId); + } + return categoryId; + } + + private IntSet parseCategories(Document wikiPage) throws IOException { + String categoryString = IOUtils.toString((Reader) wikiPage.content(CATEGORY_FIELD)); + IntSet categoryIds = new IntOpenHashSet(); + int pipeIndex; + + for (String category : categoryString.split(SEPARATOR_REGEX)) { + if ((pipeIndex = category.indexOf('|')) > -1) + category = category.substring(0, pipeIndex); + + category = StringUtils.strip(category); + if (category.length() > 0) + categoryIds.add(getCategoryId(category)); + } + + return categoryIds; + } + + @SuppressWarnings("resource") // the warning on wikiPage is false. + public void extractAllData() throws IOException { + DocumentIterator wikiPagesIterator = wikipediaDocumentSequence.iterator(); + Document wikiPage; + String title; + + ProgressLogger pl = new ProgressLogger(LOGGER, "pages"); + pl.expectedUpdates = ESITMATED_NUM_OF_PAGES; + pl.info = new Object() { + public String toString() {return catName2Id.size() + " categories found.";} + }; + pl.start("Starting to iterate all the pages in the XML file..."); + + // iterating pages + while ((wikiPage = wikiPagesIterator.nextDocument()) != null) { + switch (getNamespace(title = wikiPage.title().toString())) { + case CATEGORY: + cat2cat.put( + getCategoryId(title.substring(CATEGORY_NAME_INDEX)), + parseCategories(wikiPage) + ); + break; + case ARTICLE: + int pageId = nextPageId++; + pageId2Name.put(pageId, title); + + page2cat.put(pageId, + parseCategories(wikiPage) + ); + plainUrisFile.println(wikiPage.uri()); + + break; + default: + break; + } + + wikiPage.close(); + pl.update(); + + } + pl.done(); + + wikipediaDocumentSequence.close(); + wikiPagesIterator.close(); + } + + private void 
setPlainUrisFile(String path) throws FileNotFoundException { + this.plainUrisFile = new PrintStream(new File(path)); + } + + private void saveAllTo(String basename) throws Exception { + try { + BinIO.storeObject(pageId2Name, basename + "pageId2Name.ser"); + BinIO.storeObject(catName2Id, basename + "catName2Id.ser"); + BinIO.storeObject(page2cat, basename + "page2cat.ser"); + + if (cat2cat.isEmpty()) LOGGER.error("THE PARSING DID NOT FIND ANY CATEGORY PSEUDOTREE"); + else BVGraph.store(new IntMapGraph(cat2cat), "categoryPseudotree"); + + BinIO.storeObject(MapUtils.invert(catName2Id), basename + "catId2Name.ser"); + } catch (IOException e) { + LOGGER.error("Cannot save something :( :(", e); + } + this.plainUrisFile.close(); + } + + static String askForString(String s) { + System.out.println(s); + Scanner scanner = new Scanner(System.in); + String nextLine = scanner.nextLine(); + scanner.close(); + return nextLine; + } + + public static void main(String[] rawArguments) throws Exception { + + SimpleJSAP jsap = new SimpleJSAP( + WikipediaCategoryProducer.class.getName(), + "Read a wikipedia dump and produces these files as " + + "serialized Java objects: \n" + + " * pageId2Name.ser, an Int2ObjectMap from page ids to " + + "wikipedia page names \n" + + " * catId2Name.ser, an Object2IntMap from category ids to " + + "category names \n" + + " * page2cat.ser, an Int2ObjectMap from page ids to an IntSet" + + "of category ids \n" + + " * categoryPseudotree.graph, the Wikipedia Category Hierarchy " + + "in BVGraph format", + new Parameter[] { + new UnflaggedOption( "input", JSAP.STRING_PARSER, JSAP.REQUIRED, + "The pages-articles.xml input file, from Wikipedia." ), + new UnflaggedOption( "basename", JSAP.STRING_PARSER, JSAP.REQUIRED, + "The basename of the output files (p.e. 
a Directory with / in the end)" ), + new Switch("bzip", 'z', "bzip", "Interpret the input file as bzipped"), + new Switch("verbose", 'v', "verbose", "Print every category found to StdErr") + }); + + // Initializing input read + JSAPResult args = jsap.parse( rawArguments ); + if ( jsap.messagePrinted() ) System.exit( 1 ); + + NamespacedWikipediaDocumentSequence wikipediaDocumentSequence = new NamespacedWikipediaDocumentSequence( + args.getString("input"), + args.getBoolean("bzip"), + "http://en.wikipedia.org/wiki/", true, + true // keep all namespaces + ); + WikipediaCategoryProducer reader = + new WikipediaCategoryProducer(wikipediaDocumentSequence); + reader.setPlainUrisFile(args.getString("basename") + "pages.uris"); + reader.extractAllData(); + reader.saveAllTo(args.getString("basename")); + wikipediaDocumentSequence.close(); + + } + + +} diff --git a/java/src/it/unimi/di/wikipedia/utils/IntMapGraph.java b/java/src/it/unimi/di/wikipedia/utils/IntMapGraph.java new file mode 100644 index 0000000..7956105 --- /dev/null +++ b/java/src/it/unimi/di/wikipedia/utils/IntMapGraph.java @@ -0,0 +1,138 @@ +package it.unimi.di.wikipedia.utils; + +import it.unimi.dsi.fastutil.ints.Int2ObjectMap; +import it.unimi.dsi.fastutil.ints.Int2ObjectMap.Entry; +import it.unimi.dsi.fastutil.ints.IntArrays; +import it.unimi.dsi.fastutil.ints.IntSet; +import it.unimi.dsi.fastutil.ints.IntSets; +import it.unimi.dsi.fastutil.io.BinIO; +import it.unimi.dsi.logging.ProgressLogger; +import it.unimi.dsi.webgraph.BVGraph; +import it.unimi.dsi.webgraph.ImmutableGraph; +import it.unimi.dsi.webgraph.LazyIntIterator; +import it.unimi.dsi.webgraph.LazyIntIterators; +import it.unimi.dsi.webgraph.ScatteredArcsASCIIGraph; + +import java.io.IOException; +import java.util.concurrent.TimeUnit; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.martiansoftware.jsap.FlaggedOption; +import com.martiansoftware.jsap.JSAP; +import com.martiansoftware.jsap.JSAPException; +import 
com.martiansoftware.jsap.JSAPResult; +import com.martiansoftware.jsap.Parameter; +import com.martiansoftware.jsap.SimpleJSAP; +import com.martiansoftware.jsap.UnflaggedOption; + +public class IntMapGraph extends ImmutableGraph { + public static Logger LOGGER = LoggerFactory.getLogger(IntMapGraph.class); + + public final Int2ObjectMap map; + private final int numNodes, numArcs; + + public IntMapGraph(Int2ObjectMap map) { + this.map = map; + if (map.defaultReturnValue() == null || !map.defaultReturnValue().equals(IntSets.EMPTY_SET)) { + LOGGER.warn("It is necessary to set default return value of the map as the empty set."); + map.defaultReturnValue(IntSets.EMPTY_SET); + } + + int maxNodeIndex = 0, numArcs = 0; + for (Entry x : map.int2ObjectEntrySet()) { + if (x.getIntKey() > maxNodeIndex) + maxNodeIndex = x.getIntKey(); + for (int succ : x.getValue()) { + if (succ > maxNodeIndex) + maxNodeIndex = succ; + numArcs++; + } + } + + this.numArcs = numArcs; + this.numNodes = maxNodeIndex+1; + } + + @Override + public int numNodes() { + return numNodes; + } + + @Override + public boolean randomAccess() { + return true; + } + + @Override + public int outdegree(int x) { + return map.get(x).size(); + } + + @Override + public long numArcs() { + return numArcs; + } + + @Override + public int[] successorArray( final int x ) { + int[] succ = map.get(x).toIntArray(); + IntArrays.quickSort(succ); + return succ; + } + + @Override + public LazyIntIterator successors( final int x ) { + return LazyIntIterators.wrap( successorArray(x) ); + } + + + @Override + public ImmutableGraph copy() { + throw new UnsupportedOperationException(); + } + + @SuppressWarnings("unchecked") + public static void main( String args[] ) throws IllegalArgumentException, SecurityException, IOException, JSAPException, ClassNotFoundException { + String basename; + SimpleJSAP jsap = new SimpleJSAP( ScatteredArcsASCIIGraph.class.getName(), "Converts a int2intset fastutil map into a BVGraph.", + new Parameter[] { + 
new UnflaggedOption( "map", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The serialized Int2ObjectMap" ), + new FlaggedOption( "logInterval", JSAP.LONG_PARSER, Long.toString( ProgressLogger.DEFAULT_LOG_INTERVAL ), JSAP.NOT_REQUIRED, 'l', "log-interval", "The minimum time interval between activity logs in milliseconds." ), + new FlaggedOption( "comp", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, 'c', "comp", "A compression flag (may be specified several times)." ).setAllowMultipleDeclarations( true ), + new FlaggedOption( "windowSize", JSAP.INTEGER_PARSER, String.valueOf( BVGraph.DEFAULT_WINDOW_SIZE ), JSAP.NOT_REQUIRED, 'w', "window-size", "Reference window size (0 to disable)." ), + new FlaggedOption( "maxRefCount", JSAP.INTEGER_PARSER, String.valueOf( BVGraph.DEFAULT_MAX_REF_COUNT ), JSAP.NOT_REQUIRED, 'm', "max-ref-count", "Maximum number of backward references (-1 for ∞)." ), + new FlaggedOption( "minIntervalLength", JSAP.INTEGER_PARSER, String.valueOf( BVGraph.DEFAULT_MIN_INTERVAL_LENGTH ), JSAP.NOT_REQUIRED, 'i', "min-interval-length", "Minimum length of an interval (0 to disable)." ), + new FlaggedOption( "zetaK", JSAP.INTEGER_PARSER, String.valueOf( BVGraph.DEFAULT_ZETA_K ), JSAP.NOT_REQUIRED, 'k', "zeta-k", "The k parameter for zeta-k codes." ), + new UnflaggedOption( "basename", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The basename of the output graph" ), + } + ); + + JSAPResult jsapResult = jsap.parse( args ); + if ( jsap.messagePrinted() ) System.exit( 1 ); + + basename = jsapResult.getString( "basename" ); + + int flags = 0; + for( String compressionFlag: jsapResult.getStringArray( "comp" ) ) { + try { + flags |= BVGraph.class.getField( compressionFlag ).getInt( BVGraph.class ); + } + catch ( Exception notFound ) { + throw new JSAPException( "Compression method " + compressionFlag + " unknown." 
); + } + } + + final int windowSize = jsapResult.getInt( "windowSize" ); + final int zetaK = jsapResult.getInt( "zetaK" ); + int maxRefCount = jsapResult.getInt( "maxRefCount" ); + if ( maxRefCount == -1 ) maxRefCount = Integer.MAX_VALUE; + final int minIntervalLength = jsapResult.getInt( "minIntervalLength" ); + + final ProgressLogger pl = new ProgressLogger( LOGGER, jsapResult.getLong( "logInterval" ), TimeUnit.MILLISECONDS ); + ImmutableGraph graph = new IntMapGraph((Int2ObjectMap) BinIO.loadObject(jsapResult.getString("map"))); + BVGraph.store( graph, basename, windowSize, maxRefCount, minIntervalLength, zetaK, flags, pl ); + } + +} diff --git a/java/src/it/unimi/di/wikipedia/utils/MapUtils.java b/java/src/it/unimi/di/wikipedia/utils/MapUtils.java new file mode 100644 index 0000000..5ebbf29 --- /dev/null +++ b/java/src/it/unimi/di/wikipedia/utils/MapUtils.java @@ -0,0 +1,133 @@ +package it.unimi.di.wikipedia.utils; + +import it.unimi.dsi.fastutil.ints.AbstractInt2ObjectFunction; +import it.unimi.dsi.fastutil.ints.Int2DoubleMap; +import it.unimi.dsi.fastutil.ints.Int2ObjectFunction; +import it.unimi.dsi.fastutil.ints.IntComparator; +import it.unimi.dsi.fastutil.objects.AbstractObject2IntFunction; +import it.unimi.dsi.fastutil.objects.Object2IntFunction; +import it.unimi.dsi.logging.ProgressLogger; + +import java.lang.reflect.InvocationTargetException; +import java.util.Map; + +import org.apache.commons.lang.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class MapUtils { + public static Logger LOGGER = LoggerFactory.getLogger(MapUtils.class); + + final public static Int2ObjectFunction NUMBER_PRINTER = new AbstractInt2ObjectFunction(){ + private static final long serialVersionUID = 1L; + + @Override + public String get(int key) { + return Integer.toString(key); + } + + @Override + public boolean containsKey(int key) { + return true; + } + + @Override + public int size() { + return Integer.MAX_VALUE; + } + + }; + + final public 
static Object2IntFunction NUMBER_READER = new AbstractObject2IntFunction(){ + private static final long serialVersionUID = 1L; + + @Override + public int getInt(Object key) { + if (! (key instanceof String)) + return defRetValue; + try { + return Integer.parseInt((String) key); + } catch (NumberFormatException e) { + return defRetValue; + } + } + + @Override + public boolean containsKey(Object key) { + if (! (key instanceof String)) + return false; + else { + try { + Integer.parseInt((String) key); + return true; + } catch (NumberFormatException e) { + return false; + } + } + } + + @Override + public int size() { + return Integer.MAX_VALUE; + } + + + }; + + public static Class invertMapType(Class cls) throws ClassNotFoundException { + String[] mapType = StringUtils.splitByCharacterTypeCamelCase(cls.getSimpleName()); + String type1 = mapType[0]; + if (!mapType[1].equals("2")) + throw new IllegalArgumentException(cls + " is not a fastutil map."); + String type2 = mapType[2]; + mapType[0] = type2; + mapType[2] = type1; + String newType = StringUtils.join(mapType); + newType = "it.unimi.dsi.fastutil." + type2.toLowerCase() + "s." 
+ newType; + return Class.forName(newType); + } + + @SuppressWarnings({ "rawtypes", "unchecked" }) + public static Map invert(Map inputMap) throws InstantiationException, + IllegalAccessException, InvocationTargetException, + NoSuchMethodException, ClassNotFoundException { + LOGGER.info("Inverting map..."); + Map outputMap = (Map) invertMapType(inputMap.getClass()).getConstructor(new Class[] {}).newInstance(new Object[] {}); + + ProgressLogger pl = new ProgressLogger(LOGGER, "entries"); + pl.expectedUpdates = inputMap.size(); + pl.start(); + + for (Object entryObj : inputMap.entrySet()) { + Map.Entry entry = (Map.Entry) entryObj; + Object oldValue = outputMap.put(entry.getValue(), entry.getKey()); + if (oldValue != null) + throw new IllegalArgumentException( + "The value " + entry.getValue() + " is associated to both '" + + oldValue + "' and '" + entry.getKey() + "'. The map is not" + + "bijective" + ); + pl.lightUpdate(); + } + pl.done(); + return outputMap; + } + + public static IntComparator comparatorPuttingLargestMappedValueFirst(final Int2DoubleMap map) { + return new IntComparator() { + public int compare(Integer o1, Integer o2) { return compare(o1.intValue(), o2.intValue()); } + public int compare(int k1, int k2) { + return Double.compare(map.get(k2), map.get(k1)); + } + }; + } + + public static IntComparator comparatorPuttingSmallestMappedValueFirst(final Int2DoubleMap map) { + return new IntComparator() { + public int compare(Integer o1, Integer o2) { return compare(o1.intValue(), o2.intValue()); } + public int compare(int k1, int k2) { + return Double.compare(map.get(k1), map.get(k2)); + } + }; + } +} diff --git a/page2cat-HEAD.tsv b/page2cat-HEAD.tsv new file mode 100644 index 0000000..07df109 --- /dev/null +++ b/page2cat-HEAD.tsv @@ -0,0 +1,50 @@ +Anarchism Political ideologies Social theories Political culture Anti-capitalism Far-left politics Anarchism +Değnek Villages by country Regions of Turkey Populated places in Turkey by province +Queensland 
Conservatorium Griffith University Universities by country Entertainment in Australia Australian capital cities +Octagon Chapel, Liverpool Churches Buildings and structures in England by city +Radha Mohan People by status People by ethnicity People by ethnicity and occupation Indian people Film directors +Thomas Craig (jurist) Scottish society Alumni by university or college in Europe Poetry by nation or language Scottish people by occupation +Microsorum vieillardii Plants +Princess Augusta Sophia of the United Kingdom British monarchy European royal families +Derek Thompson (baseball) People by status Minor league baseball players by team Baseball players by team Baseball in the United States by state Major League Baseball +Battle of the Coral Sea (film) History of the United States Entertainment in the United States Military personnel Works by type and year Companies based in Los Angeles County, California +Carvone Qualia Organic compounds Hydrocarbons +Red-tailed monkey Catarrhini Animals Vertebrates by country +2003 MAC Men's Basketball Tournament Sports in the United States by city College men's basketball seasons in the United States +Louis Franchet d'Espèrey French people People by country and city Military personnel by war People by region in France Military history of France +Benjamin Tuke Irish sportspeople Rugby union teams +Cornelis de Bie Contents European writers People by country and city Dead people Alumni by university or college in Europe Poets by nationality Belgian people by occupation +Washington State Legislature State governments of the United States State government in the United States Bicameral legislatures +Đàn đá Natural materials Asian music Hornbostel-Sachs +Issa (Senegalese singer) Life People by status Alumni by university or college in the United States by state People of African descent Capitals in Africa +William Johnston Tupper Schools Alumni by university or college in the United States Canadian people by occupation Heads of 
state of former countries Canadian people by ethnic or national origin Nova Scotia Provincial and territorial capitals of Canada +Saint Vincent Academy Roman Catholic Church in the United States Counties in the New York metropolitan area High schools and secondary schools +Bonkers (song) Songs by artist Industry in the United Kingdom Works by decade +Principal axis +Thomas Jones (historian) History of the United States by state People from New York People by city or town in England Local government in New York English colonization of the Americas Wars of independence +Show Us Your Tiddas! Plays by nationality +LOA +The Last Days of the Late, Great State of California California California culture Novels by genre +Aladdin (1992 Golden Films film) Folklore 20th century in the United States Arts in the United States Entertainment in the United States Films by genre +Alejandro Ibarra People by status States of Mexico +Banbury Local government in England by county Towns in England by county +Swimming at the 1992 Summer Olympics – Men's 100 metre breaststroke Summer Olympics events by year +Underdown +Dreaming Lips +Escrow Legal documents Jargon Real estate Finance Personal finance +Aída Álvarez People by status History of the United States government Agencies of the United States government American people by occupation People by city in the United States Alumni by university or college in the United States by state American politicians American politicians by state Businesspeople by nationality Journalists by nationality +Mulberry High School +Anaxagoras (disambiguation) +Cretinism Injustice Endocrine, nutritional and metabolic diseases Disability +Nenad Petrović (chess composer) Individual sports Sportspeople by sport and nationality +Construction Clients' Group Building +Gmina Osieczna, Pomeranian Voivodeship Country subdivisions of Europe Land counties of Poland +Mikołaj Gomółka People by city in Poland Classical musicians Polish people by occupation +Academy at 
Central Cities in the United States by state +The Fourth Legacy Heavy metal albums by genre +Wonderful Shadow Songs by artist Products by company Works by decade +Kayentavenator Megafauna Fossils +Jiang Qin Life Cities in China +Ryukyu flying fox Contents Vertebrates by country Species described in the 19th century Eating behaviors +Casili, Sorsogon +Division of Coolgardie States and territories of Australia Constituencies diff --git a/page2cat.tsv.gz.REMOVED.git-id b/page2cat.tsv.gz.REMOVED.git-id new file mode 100644 index 0000000..3a21b05 --- /dev/null +++ b/page2cat.tsv.gz.REMOVED.git-id @@ -0,0 +1 @@ +e06bd5f41b12cc0b200ad99ac20c29aecff066ba \ No newline at end of file diff --git a/ranked-categories-HEAD.tsv b/ranked-categories-HEAD.tsv new file mode 100644 index 0000000..306d5ea --- /dev/null +++ b/ranked-categories-HEAD.tsv @@ -0,0 +1,20 @@ +Culture 84033.0551327917 +Society 83964.64836795685 +Humanities 81135.57333792261 +Social sciences 79597.60307068429 +Contents 79240.52000298478 +Humans 78310.63217559278 +Nationality 77141.57482468022 +Anthropology 76866.94059820025 +Fields of history 76409.94215997284 +Structure 76149.19697245964 +Academic disciplines 76087.69245474423 +Society by nationality 75270.89088860311 +Cultural spheres of influence 74697.36255461193 +Political philosophy 74500.76841205012 +Subfields by academic discipline 74460.0224443292 +Cultures 74402.63092080808 +Social groups 74076.41203034452 +Places 73634.00505099582 +Sociology of culture 73520.831210539 +Sociology 73343.61321021541 diff --git a/ranked-categories.tsv.gz.REMOVED.git-id b/ranked-categories.tsv.gz.REMOVED.git-id new file mode 100644 index 0000000..094c143 --- /dev/null +++ b/ranked-categories.tsv.gz.REMOVED.git-id @@ -0,0 +1 @@ +54d6da6a5c2f1dee2d12aa4103fc47b87251dd30 \ No newline at end of file diff --git a/run.sh b/run.sh new file mode 100644 index 0000000..e26e0e5 --- /dev/null +++ b/run.sh @@ -0,0 +1,11 @@ +WIKIDUMP_XML=enwiki-20160407-pages-articles.xml.bz2 
N_TOP_CATEGORIES=10000

# Abort immediately if any pipeline step below fails, so we never run the
# category selector on partial output from the parser.
set -e

# Sanity check: make sure the compiled LlamaFur classes are on the classpath
# before doing any expensive work. (`java --help` on a missing class prints
# "Could not find or load main class" on stderr.) The condition of an `if`
# is exempt from `set -e`, so a non-zero exit here does not abort the script.
if java it.unimi.di.wikipedia.categories.CategorySelectionToolchain --help 2>&1 | grep -q "Could not find or load main class"; then
  echo 'Java LlamaFur commands not found. You should compile them and include them in the classpath. Look at the "Compile LlamaFur code" part of readme.md!' >&2
  exit 1
fi

# Step 1: parse the Wikipedia XML dump; writes page2cat.ser, pageId2Name.ser,
# catId2Name.ser and the categoryPseudotree BVGraph into the current directory.
# Quote the expansion: the dump path may contain spaces.
java it.unimi.di.wikipedia.parsing.WikipediaCategoryProducer "$WIKIDUMP_XML" ./ --bzip

# Step 2: rank the categories and produce the final cleansed TSV outputs,
# excluding maintenance pseudo-categories via the -e filters.
java it.unimi.di.wikipedia.categories.CategorySelector categoryPseudotree \
    page2cat.ser pageId2Name.ser catId2Name.ser \
    -e "wiki" -e "categories" -e "main topic classifications" -e "template" -e "navigational box" \
    "$N_TOP_CATEGORIES" ranked-categories.tsv page2cat.tsv