WikipediaBz2DumpInputStream stream = new WikipediaBz2DumpInputStream(path);
ExecutorService executor = Executors.newFixedThreadPool(threads);
int cnt = 0;
String page;
while ((page = stream.readNext()) != null) {
String title = cleaner.getTitle(page);
// These are heuristics specific to filtering out non-articles in enwiki-20120104.
if (title.startsWith("Wikipedia:") || title.startsWith("Portal:") || title.startsWith("File:")) {
continue;