// PagesArticlesXmlParser.java

import java.io.FileInputStream;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.List;
import java.util.TimeZone;

import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.events.XMLEvent;

import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
import org.apache.solr.common.SolrInputDocument;

/**
 * Parses Wikipedia's jawiki-latest-pages-articles.xml dump and sends the
 * resulting documents to a Solr server.
 *
 * <p>The file is read as a stream of StAX events. Each {@code <page>} element
 * is converted into a {@link WikipediaModel}, and the models' documents are
 * added to Solr in batches.</p>
 */
public class PagesArticlesXmlParser {

    /** Number of documents buffered before they are sent to Solr. */
    private static final int BATCH_SIZE = 100;

    /**
     * Date format used by the timestamps in the XML.
     *
     * <p>The trailing {@code 'Z'} is quoted in the pattern, so it is matched as a
     * plain character and carries no zone information for the parser. The zone
     * is therefore set to UTC explicitly in the static initializer below;
     * otherwise timestamps would be interpreted in the JVM's default zone.</p>
     *
     * <p>{@link SimpleDateFormat} is not thread-safe; this class only uses it
     * from the single thread running {@link #main(String[])}.</p>
     */
    static final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
    static {
        sdf.setTimeZone(TimeZone.getTimeZone("UTC"));
    }

    /**
     * Reads {@code jawiki-latest-pages-articles.xml} from the working directory,
     * adds every accepted page to the Solr server at
     * {@code http://localhost:8983/solr}, then commits and optimizes the index
     * and prints the elapsed time in milliseconds.
     *
     * @param args unused
     * @throws Exception if reading the file, parsing the XML, or talking to Solr fails
     */
    public static void main(String[] args) throws Exception {

        long start = System.currentTimeMillis();

        SolrServer server = new CommonsHttpSolrServer("http://localhost:8983/solr");

        XMLInputFactory factory = XMLInputFactory.newInstance();
        // The dump is a downloaded file: do not process DTDs or resolve external entities.
        factory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
        factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);

        FileInputStream in = new FileInputStream("jawiki-latest-pages-articles.xml");
        try {
            XMLEventReader reader = factory.createXMLEventReader(in);
            try {
                List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>();
                while (reader.hasNext()) {
                    XMLEvent event = reader.nextEvent();
                    if (isStartElem(event, "page")) {
                        WikipediaModel model = pageParse(reader);
                        if (model != null) {
                            docs.add(model.getDocument());
                        }
                        if (docs.size() >= BATCH_SIZE) {
                            server.add(docs);
                            docs.clear();
                        }
                    }
                }
                // Send whatever is left over; skip the request when nothing remains.
                if (!docs.isEmpty()) {
                    server.add(docs);
                }
            } finally {
                reader.close();
            }
        } finally {
            // XMLEventReader.close() does not close the underlying stream.
            in.close();
        }

        server.commit();
        server.optimize();

        System.out.println((System.currentTimeMillis() - start) + "msec");
    }

    /**
     * Parses the contents of a {@code <page>} element.
     *
     * <p>Must be called right after the {@code <page>} start event has been
     * consumed. Reads {@code <title>}, {@code <id>} and {@code <revision>}
     * children until the matching {@code </page>}.</p>
     *
     * @param reader event reader positioned inside a {@code <page>} element
     * @return the populated model, or {@code null} when the title contains a
     *         colon (such pages are treated as administrative and skipped). In
     *         that case the rest of the page is not consumed here; the caller's
     *         loop simply passes over the remaining events.
     * @throws Exception if the XML cannot be read or a timestamp cannot be parsed
     */
    private static WikipediaModel pageParse(XMLEventReader reader) throws Exception {
        WikipediaModel model = new WikipediaModel();
        while (reader.hasNext()) {
            XMLEvent event = reader.nextEvent();
            if (isEndElem(event, "page")) {
                break;
            } else if (isStartElem(event, "revision")) {
                // The revision element is handled by revisionParse.
                revisionParse(reader, model);
            } else if (isStartElem(event, "title")) {
                String title = getText(reader, "title");
                // A colon in the title marks an administrative page: skip it.
                if (title.indexOf(':') != -1) {
                    return null;
                }
                // Strip a parenthesized annotation such as " (disambiguation)" or
                // " (music)" from the title and store it separately.
                int posStart = title.indexOf(" (");
                int posEnd = title.indexOf(')', posStart);
                if (posStart != -1 && posEnd != -1) {
                    model.setTitle(title.substring(0, posStart));
                    // +2 skips the space and the opening parenthesis.
                    model.setTitleAnnotation(title.substring(posStart + 2, posEnd));
                } else {
                    model.setTitle(title);
                }
            } else if (isStartElem(event, "id")) {
                model.setId(getText(reader, "id"));
            }
        }
        return model;
    }

    /**
     * Parses the contents of a {@code <revision>} element, copying its
     * {@code <text>} and {@code <timestamp>} into the model. Other children are
     * read and ignored.
     *
     * @param reader event reader positioned inside a {@code <revision>} element
     * @param model  model to populate
     * @throws Exception if the XML cannot be read or the timestamp does not match {@link #sdf}
     */
    private static void revisionParse(XMLEventReader reader, WikipediaModel model) throws Exception {
        while (reader.hasNext()) {
            XMLEvent event = reader.nextEvent();
            if (isEndElem(event, "revision")) {
                break;
            } else if (isStartElem(event, "text")) {
                model.setText(getText(reader, "text"));
            } else if (isStartElem(event, "timestamp")) {
                model.setLastModified(sdf.parse(getText(reader, "timestamp")));
            }
        }
    }

    /**
     * Collects character data until the end tag with the given name is found.
     *
     * <p>A StAX reader may deliver one run of text as several CHARACTERS events
     * (for example around entity references). The chunks are therefore
     * concatenated unmodified and the result is trimmed once at the end, so
     * whitespace in the middle of the text is preserved.</p>
     *
     * @param reader event reader positioned just after the element's start tag
     * @param name   local name of the element whose end tag terminates the text
     * @return the concatenated text with leading and trailing whitespace removed;
     *         empty if the element has no text
     * @throws Exception if the XML cannot be read
     */
    private static String getText(XMLEventReader reader, String name) throws Exception {
        StringBuilder builder = new StringBuilder();
        while (reader.hasNext()) {
            XMLEvent event = reader.nextEvent();
            if (isEndElem(event, name)) {
                break;
            } else if (event.getEventType() == XMLStreamConstants.CHARACTERS) {
                builder.append(event.asCharacters().getData());
            }
        }
        return builder.toString().trim();
    }

    /**
     * Tests whether the event is a start element with the given local name.
     *
     * @param event event to inspect
     * @param name  expected local name
     * @return {@code true} if the event is a START_ELEMENT whose local name equals {@code name}
     */
    private static boolean isStartElem(XMLEvent event, String name) {
        return event.getEventType() == XMLStreamConstants.START_ELEMENT
                && name.equals(event.asStartElement().getName().getLocalPart());
    }

    /**
     * Tests whether the event is an end element with the given local name.
     *
     * @param event event to inspect
     * @param name  expected local name
     * @return {@code true} if the event is an END_ELEMENT whose local name equals {@code name}
     */
    private static boolean isEndElem(XMLEvent event, String name) {
        return event.getEventType() == XMLStreamConstants.END_ELEMENT
                && name.equals(event.asEndElement().getName().getLocalPart());
    }
}

/*
 *           DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
 *                   Version 2, December 2004
 * 
 * Copyright (C) 2011 mwSoft
 * 
 * Everyone is permitted to copy and distribute verbatim or modified
 * copies of this license document, and changing it is allowed as long
 * as the name is changed.
 *  
 *            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
 *   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
 *  
 *  0. You just DO WHAT THE FUCK YOU WANT TO.
 */