/*
 * Decompiled with CFR 0.152.
 */
package org.apache.pig.piggybank.evaluation.util.apachelogparser;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.pig.EvalFunc;
import org.apache.pig.FuncSpec;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;

public class SearchTermExtractor
extends EvalFunc<String> {
    private static Matcher TERM_MATCHER = null;
    private static Matcher P_TERM_MATCHER = null;
    private static HashMap<String, Boolean> HOSTS;

    private String myDecode(String string) {
        try {
            string = URLDecoder.decode(string, "UTF-8");
        }
        catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        return string;
    }

    public String exec(Tuple input) throws IOException {
        if (input == null || input.size() == 0) {
            return null;
        }
        String url = "";
        try {
            url = (String)input.get(0);
            if (url == null) {
                return null;
            }
            URL urlObject = new URL(url);
            if (urlObject == null) {
                return null;
            }
            String host = urlObject.getHost();
            if (host == null) {
                return null;
            }
            if ((host = host.replaceFirst("^www\\.", "")) == null) {
                return null;
            }
            if (HOSTS.containsKey(host = host.toLowerCase()) || host.contains("google.co") || host.contains("search.yahoo")) {
                String queryString = urlObject.getQuery();
                if (queryString == null) {
                    return null;
                }
                TERM_MATCHER.reset(queryString);
                if (TERM_MATCHER.find()) {
                    String terms = TERM_MATCHER.group(1);
                    return this.myDecode(terms);
                }
                P_TERM_MATCHER.reset(queryString);
                if (P_TERM_MATCHER.find()) {
                    String terms = P_TERM_MATCHER.group(1);
                    return this.myDecode(terms);
                }
            }
            if (host.endsWith("feedster.com") || host.endsWith("technorati.com")) {
                String path = urlObject.getPath();
                if (path == null) {
                    return null;
                }
                path = path.replaceFirst("^/search/", "");
                return this.myDecode(path);
            }
            return null;
        }
        catch (MalformedURLException e) {
            return null;
        }
        catch (Exception e) {
            throw new IOException("Caught exception processing input row ", e);
        }
    }

    public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
        ArrayList<FuncSpec> funcList = new ArrayList<FuncSpec>();
        funcList.add(new FuncSpec(((Object)((Object)this)).getClass().getName(), new Schema(new Schema.FieldSchema(null, 55))));
        return funcList;
    }

    static {
        TERM_MATCHER = Pattern.compile("\\b(?:q|buscar|key|qry|qs|query|s|searchfor|su|w)=([^&]+)").matcher("");
        P_TERM_MATCHER = Pattern.compile("\\bp=([^&]+)").matcher("");
        HOSTS = new HashMap();
        HOSTS.put("alltheweb.com", true);
        HOSTS.put("altavista.com", true);
        HOSTS.put("aolsearch.aol.com", true);
        HOSTS.put("arianna.libero.it", true);
        HOSTS.put("as.starware.com", true);
        HOSTS.put("ask.com", true);
        HOSTS.put("blogs.icerocket.com", true);
        HOSTS.put("blueyonder.co.uk", true);
        HOSTS.put("busca.orange.es", true);
        HOSTS.put("buscador.lycos.es", true);
        HOSTS.put("buscador.terra.es", true);
        HOSTS.put("buscar.ozu.es", true);
        HOSTS.put("categorico.it", true);
        HOSTS.put("cerca.lycos.it", true);
        HOSTS.put("cuil.com", true);
        HOSTS.put("excite.it", true);
        HOSTS.put("godado.com", true);
        HOSTS.put("godado.it", true);
        HOSTS.put("gps.virgin.net", true);
        HOSTS.put("hotbot.com", true);
        HOSTS.put("ilmotore.com", true);
        HOSTS.put("it.altavista.com", true);
        HOSTS.put("ithaki.net", true);
        HOSTS.put("libero.it", true);
        HOSTS.put("lycos.es", true);
        HOSTS.put("lycos.it", true);
        HOSTS.put("mamma.com", true);
        HOSTS.put("megasearching.net", true);
        HOSTS.put("mirago.co.uk", true);
        HOSTS.put("netscape.com", true);
        HOSTS.put("ozu.es", true);
        HOSTS.put("ricerca.alice.it", true);
        HOSTS.put("search.aol.co.uk", true);
        HOSTS.put("search.bbc.co.uk", true);
        HOSTS.put("search.conduit.com", true);
        HOSTS.put("search.icq.com", true);
        HOSTS.put("search.live.com", true);
        HOSTS.put("search.lycos.co.uk", true);
        HOSTS.put("search.lycos.com", true);
        HOSTS.put("search.msn.co.uk", true);
        HOSTS.put("search.msn.com", true);
        HOSTS.put("search.myway.com", true);
        HOSTS.put("search.mywebsearch.com", true);
        HOSTS.put("search.ntlworld.com", true);
        HOSTS.put("search.orange.co.uk", true);
        HOSTS.put("search.sweetim.com", true);
        HOSTS.put("search.virginmedia.com", true);
        HOSTS.put("simpatico.ws", true);
        HOSTS.put("soso.com", true);
        HOSTS.put("suche.fireball.de", true);
        HOSTS.put("suche.web.de", true);
        HOSTS.put("terra.es", true);
        HOSTS.put("tesco.net", true);
        HOSTS.put("thespider.it", true);
        HOSTS.put("tiscali.co.uk", true);
        HOSTS.put("uk.altavista.com", true);
        HOSTS.put("uk.ask.com", true);
    }
}

