/**
 * Extracts the article link graph from a MediaWiki XML export.
 *
 * <p>The export is streamed with SAX. The elements of interest are:
 * <pre>
 * mediawiki > page >
 *     ns            (only pages with ns == 0 are kept)
 *     title
 *     id
 *     revision > text
 * </pre>
 *
 * <p>One line is written per kept page, with fields separated by {@code '>'}:
 * {@code id>title>linkCount>link1>link2...}. The output file is UTF-8 encoded.
 */
public class Wikipedia {

    /** A kept page: its id, its title and the set of link targets found in its text. */
    public static class Article implements Serializable {
        private static final long serialVersionUID = 1L;
        public int id;
        public String title;
        public Set<String> related = new HashSet<String>();
    }

    /** Element names the handler cares about; every other element maps to {@link #unknown}. */
    private enum NodeType {
        mediawiki, page, ns, id, revision, title, text, unknown
    }

    /** Input used when no first command line argument is given. */
    private static final String DEFAULT_LOCATION =
            "C:\\Users\\pdiffenderfer\\Downloads\\enwiki-20140304-pages-articles-multistream.xml";

    /** Output used when no second command line argument is given. */
    private static final String DEFAULT_OUTPUT = "./wikipedia.txt";

    /** Number of buffered articles that triggers a write to the output file. */
    private static final int BATCH_SIZE = 10000;

    /** Lookup from element name to {@link NodeType}; {@code unknown} is deliberately absent. */
    private static final Map<String, NodeType> nodeMap = new HashMap<String, NodeType>();
    static {
        for (NodeType type : NodeType.values()) {
            if (type != NodeType.unknown) {
                nodeMap.put(type.name(), type);
            }
        }
    }

    /**
     * Parses the export and writes the link list.
     *
     * @param args optional; {@code args[0]} is the path of the XML export and {@code args[1]}
     *             the path of the output file. Missing values fall back to the built-in defaults.
     * @throws Exception if the parser cannot be created, the input cannot be parsed,
     *                   or the output cannot be written
     */
    public static void main(String[] args) throws Exception {
        String location = (args != null && args.length > 0) ? args[0] : DEFAULT_LOCATION;
        String out = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT;

        SAXParserFactory factory = SAXParserFactory.newInstance();
        SAXParser saxParser = factory.newSAXParser();

        // Explicit charset: titles are full Unicode and must not depend on the platform default.
        PrintStream stream = new PrintStream(out, "UTF-8");
        try {
            DumpHandler handler = new DumpHandler(stream);
            // parse(File, ...) rather than parse(String, ...): the String overload expects a URI.
            saxParser.parse(new java.io.File(location), handler);
            handler.flush();
            // PrintStream swallows IOExceptions; surface them instead of reporting success.
            if (stream.checkError()) {
                throw new java.io.IOException("Failed writing to " + out);
            }
            System.out.println("Articles Read: " + handler.read);
        } finally {
            stream.close();
        }
    }

    /** SAX handler that collects main-namespace pages and writes them out in batches. */
    private static final class DumpHandler extends DefaultHandler {
        /** Types of the currently open elements, innermost on top. */
        private final Stack<NodeType> stack = new Stack<NodeType>();
        /** Articles collected since the last flush, keyed by title. */
        private final Map<String, Article> articles = new HashMap<String, Article>();
        /** Accumulates the character data of the page-level id / ns / title element being read. */
        private final StringBuilder field = new StringBuilder();
        /** Accumulates the text of the current page. */
        private final StringBuilder text = new StringBuilder();
        private final PrintStream stream;

        /** Page being read, or {@code null} outside of a page element. */
        private Article current;
        /** Namespace of the current page; 0 until an ns element says otherwise. */
        private int ns;
        /** Number of main-namespace pages seen so far. */
        private long read;

        DumpHandler(PrintStream stream) {
            this.stream = stream;
        }

        @Override
        public void startElement(String uri, String localName, String qName, Attributes attributes)
                throws SAXException {
            NodeType type = nodeMap.get(qName);
            if (type == null) {
                type = NodeType.unknown;
            }
            boolean parentIsPage = !stack.isEmpty() && stack.peek() == NodeType.page;
            stack.push(type);

            if (type == NodeType.page) {
                // Reset all per-page state so nothing carries over from the previous page.
                current = new Article();
                ns = 0;
                text.setLength(0);
            } else if (parentIsPage
                    && (type == NodeType.id || type == NodeType.ns || type == NodeType.title)) {
                field.setLength(0);
            }
        }

        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
            if (current == null) {
                return;
            }
            // The parser may deliver one element's content in several chunks,
            // so content is only accumulated here and interpreted in endElement.
            switch (stack.peek()) {
            case id:
            case ns:
            case title:
                // revision and contributor also contain id elements; only the page's own count.
                if (isDirectChildOfPage()) {
                    field.append(ch, start, length);
                }
                break;
            case text:
                text.append(ch, start, length);
                break;
            default:
                break;
            }
        }

        @Override
        public void endElement(String uri, String localName, String qName) throws SAXException {
            boolean pageField = isDirectChildOfPage();
            NodeType popped = stack.pop();
            if (current == null) {
                return;
            }
            switch (popped) {
            case id:
                if (pageField) {
                    current.id = parseInt("id", current.id);
                }
                break;
            case ns:
                if (pageField) {
                    ns = parseInt("ns", ns);
                }
                break;
            case title:
                if (pageField) {
                    current.title = field.toString();
                }
                break;
            case page:
                finishPage();
                break;
            default:
                break;
            }
        }

        /** Writes all buffered articles to the output and empties the buffer. */
        void flush() {
            for (Article a : articles.values()) {
                print(stream, a);
            }
            articles.clear();
        }

        /** Returns whether the innermost open element is an immediate child of the current page. */
        private boolean isDirectChildOfPage() {
            int depth = stack.size();
            return current != null && depth >= 2 && stack.get(depth - 2) == NodeType.page;
        }

        /** Parses the accumulated field as an int; an empty field yields {@code fallback}. */
        private int parseInt(String element, int fallback) throws SAXException {
            String value = field.toString().trim();
            if (value.isEmpty()) {
                return fallback;
            }
            try {
                return Integer.parseInt(value);
            } catch (NumberFormatException e) {
                throw new SAXException("Invalid <" + element + "> value: " + value, e);
            }
        }

        /** Stores the finished page if it is in namespace 0, then clears the per-page state. */
        private void finishPage() {
            if (ns == 0) {
                // Links must be extracted before the article can be flushed.
                extractLinks(text, current.related);
                articles.put(current.title, current);
                read++;
                if (articles.size() == BATCH_SIZE) {
                    flush();
                    System.out.println("Articles Read: " + read);
                }
            }
            text.setLength(0);
            current = null;
        }
    }

    /**
     * Collects the targets of all {@code [[target]]} / {@code [[target|label]]} links in
     * {@code text}. Targets that contain {@code ':'}, {@code '#'}, <code>'{'</code> or a line
     * break, that start with {@code '/'}, or that are empty are skipped. A {@code [[} without
     * a closing {@code ]]} ends the scan.
     */
    private static void extractLinks(StringBuilder text, Set<String> related) {
        int start = text.indexOf("[[");
        while (start != -1) {
            int end = text.indexOf("]]", start + 2);
            if (end == -1) {
                break;
            }
            String target = text.substring(start + 2, end);
            // Only the part before the pipe names the linked page; the rest is display text.
            int pipe = target.indexOf('|');
            if (pipe != -1) {
                target = target.substring(0, pipe);
            }
            if (isArticleTarget(target)) {
                related.add(target);
            }
            start = text.indexOf("[[", end + 2);
        }
    }

    /** Returns whether {@code target} passes the link filter described on {@code extractLinks}. */
    private static boolean isArticleTarget(String target) {
        return !target.isEmpty()
                && target.indexOf(':') == -1
                && target.indexOf('#') == -1
                && target.indexOf('{') == -1
                && target.indexOf('\n') == -1
                && target.indexOf('\r') == -1
                && target.charAt(0) != '/';
    }

    /** Writes one article as {@code id>title>linkCount>link1>link2...} followed by a line break. */
    private static void print(PrintStream out, Article a) {
        out.format("%d>%s>%d", a.id, a.title, a.related.size());
        for (String x : a.related) {
            out.print('>');
            out.print(x);
        }
        out.println();
    }
}