/*
 * Decompiled with CFR 0.152.
 */
package gr.forth.ics.isl.xlink.textextractor;

import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;
import gr.forth.ics.isl.xlink.exceptions.FalseFileTypeException;
import gr.forth.ics.isl.xlink.textextractor.PowerPointTextExtractor;
import gr.forth.ics.isl.xlink.textextractor.TXTTextExtractor;
import gr.forth.ics.isl.xlink.textextractor.TextExtractor;
import gr.forth.ics.isl.xlink.textextractor.WordTextExtractor;
import gr.forth.ics.isl.xlink.textextractor.XMLbasedTextExtractor;
import gr.forth.ics.isl.xlink.util.HTMLTag;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * {@link TextExtractor} that pulls text out of the resource behind a URL.
 *
 * <p>The extraction strategy is chosen from the content type reported by the
 * {@link URLConnection} opened on the URL:
 * <ul>
 *   <li>{@code application/pdf}: page-by-page extraction via {@link PdfTextExtractor};</li>
 *   <li>{@code content/unknown}: the URL path suffix selects {@link WordTextExtractor}
 *       ({@code doc}/{@code docx}) or {@link PowerPointTextExtractor}
 *       ({@code ppt}/{@code pptx}); any other suffix is not handled yet;</li>
 *   <li>{@code text/plain}: {@link TXTTextExtractor};</li>
 *   <li>{@code application/xml}: {@link XMLbasedTextExtractor};</li>
 *   <li>anything else: the source code returned by {@link HTMLTag}.</li>
 * </ul>
 *
 * <p>NOTE(review): the Word, PowerPoint, TXT and XML extractors are handed only
 * {@code url.getPath()}, not the URL itself, so they presumably expect a locally
 * readable path (e.g. {@code file:} URLs). Confirm against those classes.
 */
public class WebPageTextExtractor
implements TextExtractor {
    private static final Logger LOGGER = Logger.getLogger(WebPageTextExtractor.class.getName());

    /** The URL, as given by the caller, of the resource to extract text from. */
    private String webPageUrl;

    /**
     * Creates an extractor for the given URL.
     *
     * @param url textual URL of the resource; it is parsed only when
     *            {@link #extractText()} is called
     */
    public WebPageTextExtractor(String url) {
        this.webPageUrl = url;
    }

    /**
     * Extracts the text of the resource this extractor was created for.
     *
     * <p>Failures to parse the URL, to open the connection, or to open the PDF
     * are logged at {@link Level#SEVERE} and yield an empty string instead of a
     * {@link NullPointerException}.
     *
     * @return the extracted text; an empty string when the URL is malformed, the
     *         connection cannot be opened, the PDF cannot be read, or the
     *         {@code content/unknown} resource has an unsupported suffix
     * @throws FalseFileTypeException declared by this method's signature;
     *         presumably raised by the delegated extractors (verify against them)
     */
    public String extractText() throws FalseFileTypeException {
        URL url;
        try {
            url = new URL(this.webPageUrl);
        }
        catch (MalformedURLException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
            // Nothing can be fetched without a valid URL.
            return "";
        }

        URLConnection urlConn;
        try {
            urlConn = url.openConnection();
        }
        catch (IOException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
            return "";
        }

        // getContentType() may return null; the constant-first comparisons below
        // are null-safe, so an unknown type simply falls through to the default branch.
        String contentType = urlConn.getContentType();

        if ("application/pdf".equalsIgnoreCase(contentType)) {
            System.out.println("# Reading PDF file!");
            return this.extractPdfText(url);
        }
        if ("content/unknown".equalsIgnoreCase(contentType)) {
            String lowerPath = url.getPath().toLowerCase();
            if (lowerPath.endsWith("doc") || lowerPath.endsWith("docx")) {
                System.out.println("# Reading MSWORD file!");
                return new WordTextExtractor(url.getPath()).extractText();
            }
            if (lowerPath.endsWith("ppt") || lowerPath.endsWith("pptx")) {
                System.out.println("# Reading MSPOWERPOINT file!");
                return new PowerPointTextExtractor(url.getPath()).extractText();
            }
            // Other unknown-typed resources are not supported yet.
            System.out.println("to-do");
            return "";
        }
        if ("text/plain".equalsIgnoreCase(contentType)) {
            System.out.println("# Reading txt file!");
            return new TXTTextExtractor(url.getPath()).extractText();
        }
        if ("application/xml".equalsIgnoreCase(contentType)) {
            System.out.println("# Reading xml file!");
            return new XMLbasedTextExtractor(url.getPath()).extractText();
        }
        // Default: use the source code that HTMLTag returns for the URL.
        return new HTMLTag(url).getSourceCode();
    }

    /**
     * Reads every page of the PDF at {@code url} and joins the page texts,
     * each followed by a newline.
     *
     * <p>A page whose extraction fails with an {@link IOException} is logged and
     * skipped; the remaining pages are still processed.
     *
     * @param url location of the PDF document
     * @return the concatenated page texts, or an empty string if the PDF cannot be opened
     */
    private String extractPdfText(URL url) {
        PdfReader reader;
        try {
            reader = new PdfReader(url);
        }
        catch (IOException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
            return "";
        }
        StringBuilder text = new StringBuilder();
        try {
            int pageCount = reader.getNumberOfPages();
            // PDF page numbers are 1-based.
            for (int page = 1; page <= pageCount; ++page) {
                try {
                    text.append(PdfTextExtractor.getTextFromPage(reader, page)).append("\n");
                }
                catch (IOException ex) {
                    LOGGER.log(Level.SEVERE, null, ex);
                }
            }
        }
        finally {
            // Always release the reader, even if page extraction fails unexpectedly.
            reader.close();
        }
        return text.toString();
    }
}

