/**
 * Copyright (C) 2015 - 2018 Kosmos contact@kosmos.fr
 *
 * Projet: core
 * Version: 6.02.48
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.jsbsoft.jtf.textsearch.sitesdistants;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringWriter;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import org.apache.oro.text.perl.Perl5Util;
import org.apache.oro.text.regex.MalformedPatternException;
import org.apache.oro.text.regex.Pattern;
import org.apache.oro.text.regex.PatternMatcherInput;
import org.apache.oro.text.regex.Perl5Compiler;
import org.apache.oro.text.regex.Perl5Matcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Singleton grouping the helper methods used to download and parse HTML pages
 * (link extraction, robots.txt parsing).
 *
 * @author jbiard
 */
public class TraitementPageHTML {

	/**
	 * Regular expression isolating hyperlinks: a {@code src} or {@code href} attribute (any letter case)
	 * followed by a non-empty, double-quoted value.
	 */
	public final static String REGEXP_HREF = "(([Ss][Rr][Cc])|([Hh][Rr][Ee][Ff]))[ \\t\\n\\x0B\\f\\r]*=[ \\t\\n\\x0B\\f\\r]*\"[^\"]+\"";

	/**
	 * Regular expression spotting link targets that must be skipped: style sheets, javascript, mailto
	 * links and a few media types. It is compiled case-insensitively and applied as a "contains" search.
	 */
	public final static String REGEXP_NO_CSS_JSCRIPT_MAILTO = "\\.*(\\.css)|(javascript)|(mailto)|(gif)|(jpg)|(swf)";

	private static final Logger LOG = LoggerFactory.getLogger(TraitementPageHTML.class);

	/**
	 * The unique instance. Declared volatile so that the unsynchronized null check in
	 * {@link #GetInstance()} only sees it once the compiled patterns below have been written.
	 */
	private static volatile TraitementPageHTML _instance;

	/** Compiled forms of {@link #REGEXP_HREF} and {@link #REGEXP_NO_CSS_JSCRIPT_MAILTO}. */
	private static Pattern _regexpPattern, _regexpPatternNoCSSJScriptMailTo;

	/**
	 * Private constructor: singleton.
	 */
	private TraitementPageHTML() {}

	/**
	 * Static accessor guaranteeing a single instance (singleton). The regular expressions are compiled
	 * once, on the first call.
	 *
	 * @return the unique instance
	 */
	public static TraitementPageHTML GetInstance() {
		if (_instance == null) {
			synchronized (TraitementPageHTML.class) {
				// second check, for the thread that was waiting on the lock
				if (_instance == null) {
					try {
						// the same compiler is reused for both patterns
						final Perl5Compiler compiler = new Perl5Compiler();
						_regexpPattern = compiler.compile(REGEXP_HREF);
						_regexpPatternNoCSSJScriptMailTo = compiler.compile(REGEXP_NO_CSS_JSCRIPT_MAILTO, Perl5Compiler.CASE_INSENSITIVE_MASK);
					} catch (final MalformedPatternException e) {
						LOG.error("mauvais pattern", e);
					}
					// published last: a thread that sees a non-null instance also sees the patterns
					_instance = new TraitementPageHTML();
				}
			}
		}
		return _instance;
	}

	/**
	 * Returns the HTML page as a string.
	 *
	 * @param url
	 *            url to download
	 *
	 * @return the page content
	 *
	 * @throws IOException
	 *             if the url cannot be read
	 */
	public String getPageHTML(final URL url) throws IOException {
		final StringWriter sw = new StringWriter();
		sauvegardePage(url, sw);
		return sw.toString();
	}

	/**
	 * Extracts the links of a page using the regular expressions.
	 * <p>
	 * Fragments ({@code #...}) are removed, and targets matching {@link #REGEXP_NO_CSS_JSCRIPT_MAILTO}
	 * are skipped. Links that do not start with "http" are resolved against the url of the first page;
	 * when that url cannot be used as a base, they are returned as found in the page.
	 *
	 * @param szPage
	 *            page source
	 * @param szUrlPremierePage
	 *            url of the first page crawled, used as the base for resolving non-absolute links
	 *
	 * @return list of the extracted links
	 */
	public List<String> extraireLiens(final String szPage, final String szUrlPremierePage) {
		final URL urlBase = construireUrlBase(szUrlPremierePage);
		final PatternMatcherInput pmiPage = new PatternMatcherInput(szPage);
		final Perl5Matcher matcher = new Perl5Matcher();
		final List<String> lHref = new ArrayList<String>();
		// loop over every matching attribute
		while (matcher.contains(pmiPage, _regexpPattern)) {
			final String szResultat = matcher.getMatch().toString();
			// the value sits between the first and the last double quote of the match
			String szHref = szResultat.substring(szResultat.indexOf('"') + 1, szResultat.lastIndexOf('"'));
			// 20050809 JB urls with '#': only the part before the fragment is kept
			final int nIndexDiese = szHref.indexOf('#');
			if (nIndexDiese != -1) {
				szHref = szHref.substring(0, nIndexDiese);
			}
			// skip css, javascript, mail links and media files
			if (matcher.contains(szHref, _regexpPatternNoCSSJScriptMailTo)) {
				continue;
			}
			final boolean bDebuteParHttp = szHref.regionMatches(true, 0, "http", 0, 4);
			if (bDebuteParHttp || urlBase == null) {
				lHref.add(szHref);
				continue;
			}
			// 20050805 JB: java.net.URL is left in charge of building the absolute url
			try {
				lHref.add(new URL(urlBase, szHref).toString());
			} catch (final MalformedURLException e) {
				LOG.error("mauvaise URL", e);
			}
		}
		return lHref;
	}

	/**
	 * Builds the base url used to resolve the non-absolute links of a page.
	 * <p>
	 * The complete url of the first page is used as context: {@link URL#URL(URL, String)} drops the last
	 * path segment itself, whereas cutting the string at its last '/' beforehand loses one directory
	 * level (and turns "http://host" into "http:/").
	 *
	 * @param szUrlPremierePage
	 *            url of the first page crawled
	 *
	 * @return the base url, or null when the string contains no usable '/' or is not a valid url
	 */
	private URL construireUrlBase(final String szUrlPremierePage) {
		if (szUrlPremierePage.lastIndexOf('/') <= 0) {
			return null;
		}
		try {
			return new URL(szUrlPremierePage);
		} catch (final MalformedURLException e) {
			LOG.error("mauvaise URL", e);
			return null;
		}
	}

	/**
	 * Parses a robots.txt file. The logic was taken from a Perl implementation; since it is only rarely
	 * called, it is not optimized for speed.
	 * <p>
	 * Lines other than {@code Disallow:} (including {@code User-agent:}) are ignored, so the rules of
	 * every record in the file are returned. Comment lines and trailing {@code # ...} comments are
	 * stripped before a line is examined.
	 * <p>
	 * NOTE(review): an empty {@code Disallow:} value is reported as "/", i.e. everything forbidden,
	 * whereas the robots.txt convention reads it as "nothing forbidden" - behaviour kept as is, to be
	 * confirmed.
	 *
	 * @param r
	 *            reader on the robots.txt content
	 *
	 * @return the disallowed paths, in file order
	 *
	 * @throws IOException
	 *             if the reader fails
	 */
	public String[] parse(final BufferedReader r) throws IOException {
		final Perl5Util p = new Perl5Util();
		final List<String> disallowed = new ArrayList<String>();
		String line;
		while ((line = r.readLine()) != null) {
			if (p.match("/^#.*/", line)) {
				// a comment line
				continue;
			}
			// drop a trailing comment together with the blanks preceding it
			line = p.substitute("s/\\s*\\#.*//", line);
			if (p.match("/^Disallow:\\s*(.*)/i", line)) {
				final String disallow = p.group(1);
				// a non-empty value is assumed to be a relative path
				disallowed.add(disallow != null && disallow.length() > 0 ? disallow : "/");
			}
			// any other line (blank, User-agent, unknown directive) is ignored
		}
		return disallowed.toArray(new String[disallowed.size()]);
	}

	/**
	 * Writes the HTML page to a writer.
	 * <p>
	 * Every byte read is written as one character: no charset decoding is performed, so multi-byte
	 * encodings are not reassembled. The stream opened on the url is always closed; the writer is left
	 * open for the caller.
	 *
	 * @param url
	 *            url to download
	 * @param writer
	 *            target writer
	 *
	 * @throws IOException
	 *             if the url cannot be read
	 */
	protected void sauvegardePage(final URL url, final Writer writer) throws IOException {
		final BufferedInputStream in = new BufferedInputStream(url.openStream());
		try {
			for (int c = in.read(); c != -1; c = in.read()) {
				writer.write(c);
			}
		} finally {
			// release the underlying connection even when reading or writing fails
			in.close();
		}
	}
}