package eu.dnetlib.data.collector.plugins.mediawiki;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Queue;
import java.util.concurrent.PriorityBlockingQueue;

/**
 * Iterates over the entities of a MediaWiki/Wikibase installation, returning each entity as an XML string.
 *
 * <p>The iterator works in two steps per batch:
 * <ol>
 * <li>it lists the pages of the configured namespace through {@code action=query&list=allpages};</li>
 * <li>it resolves the listed pages through {@code action=wbgetentities} and enqueues every
 * {@code entity} element found in the response.</li>
 * </ol>
 * Batches are downloaded lazily: the first one on the first call to {@link #hasNext()} or {@link #next()},
 * the following ones whenever the internal buffer runs empty and the API returned a continuation token.
 *
 * <p>The buffer is a {@link PriorityBlockingQueue} without a comparator, hence the entities of a batch are
 * returned according to the natural ordering of their XML strings, not in download order.
 *
 * <p>Any failure while downloading or parsing is reported as a {@link RuntimeException} wrapping the cause.
 */
public class MediawikiIterator implements Iterator<String> {

    private static final Log log = LogFactory.getLog(MediawikiIterator.class);

    // Call templates used by this class:
    //   http://www.eagle-network.eu/wiki/api.php?action=query&list=allpages&apnamespace=120&aplimit=10&format=xml
    //   http://www.eagle-network.eu/wiki/api.php?action=wbgetentities&ids=Q100|Q1000|Q1001|Q1002|Q1003|Q1004|Q1005|Q1006|Q1007|Q1008
    private static final String LIST_QUERY_PREFIX = "?action=query&list=allpages&apnamespace=";
    private static final String LIST_QUERY_SUFFIX = "&aplimit=50&format=xml";
    private static final String ENTITIES_QUERY = "?action=wbgetentities&format=xml&ids=";

    /** Continuation token as emitted inside the legacy {@code query-continue} element. */
    private static final String LEGACY_CONTINUE_XPATH = "//*[local-name()='query-continue']//*[local-name()='allpages']/@apcontinue";
    /** Continuation token as emitted as an attribute of the {@code continue} element. */
    private static final String CONTINUE_XPATH = "//*[local-name()='continue']/@apcontinue";

    private final Queue<String> queue = new PriorityBlockingQueue<String>();
    private final SAXReader reader = new SAXReader();
    private String baseUrl;
    private String apnamespace;
    private String apcontinue;
    private boolean started;

    /**
     * Creates an iterator without base url and namespace. No setter is defined in this class, so every
     * download attempted by an instance built this way fails with a {@link RuntimeException}.
     */
    public MediawikiIterator() {
    }

    /**
     * @param baseUrl     url of the {@code api.php} endpoint; the query string is appended to it as is
     * @param apnamespace value of the {@code apnamespace} parameter of the page listing request
     */
    public MediawikiIterator(final String baseUrl, final String apnamespace) {
        this.baseUrl = baseUrl;
        this.apnamespace = apnamespace;
        this.started = false;
    }

    /** Downloads the first batch the first time it is invoked; later invocations do nothing. */
    private void verifyStarted() {
        if (!this.started) {
            this.apcontinue = firstPage();
            this.started = true;
        }
    }

    /**
     * Makes sure the buffer holds at least one entity whenever the remote side still has data.
     * A batch may legitimately produce no entity while still carrying a continuation token, so the
     * download is repeated until something is buffered or the token is exhausted.
     */
    private void fillQueue() {
        verifyStarted();
        while (queue.isEmpty() && (apcontinue != null) && !apcontinue.isEmpty()) {
            apcontinue = otherPages(apcontinue);
        }
    }

    /**
     * @return {@code true} if at least one more entity is available; may trigger HTTP downloads
     * @throws RuntimeException if a download or the parsing of a response fails
     */
    @Override
    public boolean hasNext() {
        synchronized (queue) {
            fillQueue();
            return !queue.isEmpty();
        }
    }

    /**
     * @return the XML serialization of the next entity
     * @throws NoSuchElementException if there are no more entities
     * @throws RuntimeException       if a download or the parsing of a response fails
     */
    @Override
    public String next() {
        synchronized (queue) {
            fillQueue();
            final String res = queue.poll();
            if (res == null) {
                throw new NoSuchElementException("No more entities available from: " + baseUrl);
            }
            return res;
        }
    }

    /**
     * Removal is not supported by the remote source. The call is silently ignored (instead of throwing
     * {@link UnsupportedOperationException}) to keep the historical behaviour of this class.
     */
    @Override
    public void remove() {
    }

    /** Downloads the first batch and returns the continuation token for the next one (empty if none). */
    private String firstPage() {
        final String url = baseUrl + LIST_QUERY_PREFIX + apnamespace + LIST_QUERY_SUFFIX;
        log.info("Downloading first page using url: " + url);

        return downloadPage(url);
    }

    /** Downloads the batch identified by the given continuation token and returns the token of the following one. */
    private String otherPages(String apcontinue) {
        // the token comes from page titles: it must be escaped to be safely used inside a query string
        return downloadPage(baseUrl + LIST_QUERY_PREFIX + apnamespace + LIST_QUERY_SUFFIX + "&apcontinue=" + urlEncode(apcontinue));
    }

    /**
     * Downloads a page listing, resolves the listed pages to entities and enqueues them.
     *
     * @param url complete url of the {@code list=allpages} request
     * @return the continuation token found in the listing, or an empty string if there is none
     * @throws RuntimeException if one of the two requests fails or its response cannot be parsed
     */
    private String downloadPage(String url) {
        String currentUrl = url; // url reported in case of failure
        try {
            log.info("HTTP GET: " + currentUrl);
            final Document listing = readDocument(currentUrl);

            String nextApContinue = listing.valueOf(LEGACY_CONTINUE_XPATH);
            if ((nextApContinue == null) || nextApContinue.isEmpty()) {
                // fall back to the <continue apcontinue="..."/> format
                nextApContinue = listing.valueOf(CONTINUE_XPATH);
            }

            final StringBuilder ids = new StringBuilder();
            for (Object o : listing.selectNodes("//*[local-name()='p']")) {
                final String title = ((Element) o).valueOf("@title");
                final String id = entityId(title);
                if (id == null) {
                    log.warn("Skipping page with unexpected title: '" + title + "'");
                    continue;
                }
                if (ids.length() > 0) {
                    ids.append('|');
                }
                ids.append(id);
            }

            if (ids.length() == 0) {
                // nothing to resolve: asking wbgetentities for an empty list of ids would be a malformed request
                return nextApContinue;
            }

            currentUrl = baseUrl + ENTITIES_QUERY + ids;
            log.debug("HTTP GET: " + currentUrl);
            final Document entities = readDocument(currentUrl);

            // extract single mediawiki entities and enqueue each one separately
            for (Object o : entities.selectNodes("//*[local-name()='entity']")) {
                queue.add(((Node) o).asXML());
            }

            return nextApContinue;
        } catch (Exception e) {
            throw new RuntimeException("Error processing data from: " + currentUrl, e);
        }
    }

    /**
     * Extracts the entity id from a page title.
     *
     * @param title page title, usually in the form {@code Namespace:Id}
     * @return the token following the first colon; the whole title when it has no colon (assumes the title
     * is itself the id when the page carries no namespace prefix - to be confirmed against the target
     * wiki); {@code null} when no usable id can be extracted
     */
    private static String entityId(final String title) {
        if ((title == null) || title.isEmpty()) {
            return null;
        }
        final String[] titleTokens = title.split(":");
        final String id;
        if (titleTokens.length > 1) {
            id = titleTokens[1];
        } else if (title.indexOf(':') < 0) {
            id = title;
        } else {
            id = ""; // a prefix followed by nothing, e.g. "Item:"
        }
        return id.isEmpty() ? null : id;
    }

    /** Reads and parses the XML document served at the given url, always releasing the underlying stream. */
    private Document readDocument(final String url) throws IOException, DocumentException {
        final InputStream in = new URL(url).openStream();
        try {
            return reader.read(in);
        } finally {
            in.close();
        }
    }

    /** Escapes a value so that it can be used as a query string parameter. */
    private static String urlEncode(final String value) {
        try {
            return URLEncoder.encode(value, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            // UTF-8 support is mandatory for every Java platform implementation
            throw new IllegalStateException("UTF-8 encoding not supported", e);
        }
    }
}
