package org.archive.modules.extractor;

import java.io.*;
import java.util.*;

import org.apache.commons.cli.ParseException;

import org.archive.io.arc.ARCRecord;
import org.archive.io.arc.ARCRecordMetaData;
import org.archive.io.arc.ARCReader;
import org.archive.io.arc.ARCReaderFactory;
import org.archive.io.ArchiveRecord;

import org.archive.modules.DefaultProcessorURI;
import org.archive.modules.Processor;
import org.archive.modules.ProcessorURI;

import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.state.ExampleStateProvider;
import org.archive.util.Recorder;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpMethodBase;
import org.apache.commons.httpclient.HttpConnection;
import org.apache.commons.httpclient.URIException;

import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.CoreAttributeConstants;

/**
 * Applies a list of link extractors to every record of an ARC file and
 * writes the outlinks found for each record to a text file.
 * 
 * @author knguyen
 */

public class ExtractorARC {

	/**
	 * ARC reader used to iterate over the records of the input. It is
	 * {@code null} when the no-argument constructor was used.
	 */
	private ARCReader r;

	/**
	 * Extractors that are applied, in insertion order, to each record.
	 */
	private final List<Processor> extractors = new ArrayList<Processor>();

	/**
	 * Output file name. It is set by {@link #ExtractorARC(String)} from the
	 * ARC file name and the current time, and is {@code null} otherwise.
	 */
	private String fOut;

	/**
	 * Should we digest as we read?
	 */
	private boolean digest = false;

	/**
	 * Should the parse be strict?
	 */
	private boolean strict = false;

	/**
	 * Should we parse HTTP headers?
	 */
	private boolean parse = false;

	/**
	 * Creates an instance without an ARC reader. Such an instance can only be
	 * used with {@link #extract(DefaultProcessorURI)}; {@link #extract()}
	 * needs the reader set up by {@link #ExtractorARC(String)}.
	 */
	public ExtractorARC() {
	}

	/**
	 * Opens the given ARC file and derives the output file name from it.
	 * 
	 * NOTE(review): a RuntimeException raised while opening the reader is
	 * reported on stderr and then terminates the JVM with exit status 1. That
	 * is the historical behaviour and is kept as-is, but it is surprising
	 * for a constructor.
	 * 
	 * @param urlOrPath
	 *            URL or path of the ARC file
	 * @throws ParseException
	 * @throws IOException
	 * @throws java.text.ParseException
	 */
	public ExtractorARC(String urlOrPath) throws ParseException, IOException,
			java.text.ParseException {
		try {
			r = ARCReaderFactory.get(urlOrPath);
			r.setStrict(strict);
			r.setParseHttpHeaders(parse);
			r.setDigest(digest);

		} catch (RuntimeException e) {
			System.err.println("Exception processing " + urlOrPath + ": "
					+ e.getMessage());
			e.printStackTrace(System.err);
			System.exit(1);
		}
		fOut = r.getFileName() + "_" + System.currentTimeMillis()
				+ "_outlinks.txt";
	}

	/**
	 * Builds a recorder by wrapping the given ARC record and reading it to
	 * the end.
	 * 
	 * @param ar
	 *            ARC record to read
	 * @return a mock-up recorder that gets its content from the ARC record
	 * @throws Exception
	 *             if the temp file cannot be created or the record cannot be
	 *             read
	 */
	public Recorder createRecorderFromARCRecord(ARCRecord ar) throws Exception {

		// TODO: find a way to create a recorder without a temp disk file.
		File temp = File.createTempFile("test", ".tmp");
		// One temp file is created per record; without this they would stay
		// in the temp directory forever.
		temp.deleteOnExit();
		Recorder recorder = new Recorder(temp, 2048, 2048);

		InputStream is = recorder.inputWrap(ar);
		try {
			int contentBegin = ar.getHeader().getContentBegin();
			int offset = 0;
			// Read one byte at a time so that the content mark can be placed
			// at an exact offset.
			// NOTE(review): the mark is set after the byte at index
			// contentBegin has been read, which may be one byte late --
			// confirm against Recorder.markContentBegin() semantics.
			for (int x = is.read(); x >= 0; x = is.read()) {
				if (offset == contentBegin) { // mark where content begins
					recorder.markContentBegin();
				}
				offset++;
			}
		} finally {
			// Release the stream even when reading fails.
			is.close();
		}
		return recorder;
	}

	/**
	 * Adds an extractor to the extractor list. If the extractor is an
	 * {@link ExtractorHTTP} and a reader is available, HTTP header parsing is
	 * switched on for the reader.
	 * 
	 * @param aE
	 *            extractor to add
	 */
	public void addExtractor(Extractor aE) {
		extractors.add(aE);
		if ((aE instanceof ExtractorHTTP) && (r != null)) {
			r.setParseHttpHeaders(true);
		}
	}

	/**
	 * Adds every extractor of the given collection, in iteration order.
	 * 
	 * @param e
	 *            extractors to add
	 */
	public void addExtractor(Collection<Extractor> e) {
		for (Extractor aE : e) {
			addExtractor(aE);
		}
	}

	/**
	 * Runs all registered extractors over the given URI. This is best-effort:
	 * a failure is reported on stdout (plus a stack trace) and the remaining
	 * extractors are skipped.
	 * 
	 * @param euri
	 *            URI object the extractors operate on
	 */
	public void extract(DefaultProcessorURI euri) {
		try {
			for (Processor e : extractors) {
				e.process(euri);
			}

		} catch (Exception e) {
			System.out.println("Exception");
			e.printStackTrace();
		}
	}

	// NOTE: an earlier, commented-out extract(ARCRecord) variant lived here. It
	// built the recorder with Recorder.wrapInputStreamWithHttpRecord(scratchDir,
	// getClass().getName(), ar, null) instead of createRecorderFromARCRecord and
	// printed the links to stdout. It referenced scratchDir/DEFAULT_SCRATCH,
	// which are not defined in this class, and was removed as dead code.

	/**
	 * Extracts outlinks from every record of the input ARC and writes them to
	 * the output text file. The writer and the ARC reader are closed when the
	 * method returns, whether it succeeds or fails.
	 * 
	 * @throws IllegalStateException
	 *             if this instance was not created with
	 *             {@link #ExtractorARC(String)}
	 * @throws Exception
	 *             if reading a record or writing the output fails
	 */
	public void extract() throws Exception {
		if (r == null || fOut == null) {
			throw new IllegalStateException("No ARC reader available: "
					+ "create the instance with ExtractorARC(String)");
		}

		try {
			BufferedWriter out = new BufferedWriter(new FileWriter(fOut));
			try {
				Iterator<ArchiveRecord> ite = r.iterator();
				// move through all the ARC records
				while (ite.hasNext()) {
					extractRecord((ARCRecord) ite.next(), out);
				}
			} finally {
				out.close();
			}
		} finally {
			r.close();
		}
	}

	/**
	 * Applies all extractors to one ARC record and appends the record URL and
	 * its outlinks to the output.
	 * 
	 * NOTE(review): the Recorder created for the record is not cleaned up
	 * here.
	 */
	private void extractRecord(ARCRecord ar, BufferedWriter out)
			throws Exception {
		ARCRecordMetaData metaData = ar.getMetaData();

		// manufacture a recorder and an extractor URI
		Recorder recorder = createRecorderFromARCRecord(ar);

		// create a mocked-up CURI so that extractors can use it
		UURI src = UURIFactory.getInstance(metaData.getUrl());
		DefaultProcessorURI euri = new DefaultProcessorURI(src,
				LinkContext.NAVLINK_MISC);
		euri.setRecorder(recorder);
		euri.setContentLength(metaData.getLength());
		euri.setContentType(metaData.getMimetype());

		// ExtractorTool.java from Heritrix 1 would build the URI with
		// getProcessorURI(ar, recorder) instead. According to the original
		// author that did not work here: extractors usually found no
		// outlinks with a URI created that way.

		// apply all extractors over the ARC record
		try {
			for (Processor e : extractors) {
				e.process(euri);
			}
		} catch (Exception e) {
			// if something goes wrong, report it via output file as well
			out.write("EXCEPTION \n");
			out.write(metaData.toString() + "\n");
			e.printStackTrace();
			out.write("END_EXCEPTION \n");
		}

		// write extracted links to output file
		Link[] links = euri.getOutLinks().toArray(new Link[0]);
		out.write(ar.getHeader().getUrl() + "\n");
		out.write("< \n");
		for (Link l : links) {
			out.write(l.toString() + "\n");
		}
		out.write("> \n");
	}

	/**
	 * Builds a processor URI for the given record, following the approach of
	 * ExtractorTool.java from Heritrix 1. It is not called from within this
	 * class.
	 * 
	 * @param record
	 *            ARC record supplying URL, length, mimetype and headers
	 * @param r
	 *            recorder to attach to the URI
	 * @return the populated URI object
	 * @throws URIException
	 *             if the record URL cannot be turned into a UURI
	 */
	protected ProcessorURI getProcessorURI(final ARCRecord record,
			final Recorder r) throws URIException {

		UURI src = UURIFactory.getInstance(record.getMetaData().getUrl());
		DefaultProcessorURI curi = new DefaultProcessorURI(src,
				LinkContext.NAVLINK_MISC);

		// Heritrix 1 created the URI as
		// new CrawlURI(UURIFactory.getInstance(record.getMetaData().getUrl()))
		// According to the original author, doing so here fails with
		// "ToeThread never set up CrawlURI's sheet" (CrawlURI.java), which
		// seems related to StateProvider; DefaultProcessorURI is used as a
		// workaround.

		curi.setContentLength(record.getMetaData().getLength());
		curi.setContentType(record.getMetaData().getMimetype());
		curi.setRecorder(r);

		// Fake out the extractor that this is a legit HTTP transaction.
		if (!"filedesc".equals(curi.getUURI().getScheme())) {
			// In Heritrix 1 this was curi.putObject(...).
			// NOTE(review): confirm that getData().put(...) is equivalent.
			curi.getData().put(CoreAttributeConstants.A_HTTP_TRANSACTION,
					new HttpMethodBase() {
						@Override
						public String getName() {
							return this.getClass().getName() + "_method";
						}

						@Override
						public Header getResponseHeader(String headerName) {
							String value = (String) record.getMetaData()
									.getHeaderValue(headerName);
							return (value == null || value.length() == 0) ? null
									: new Header(headerName, value);
						}
					});
			// A record without a status code is treated as a 200 response.
			String statusCode = record.getMetaData().getStatusCode();
			curi.setFetchStatus(statusCode == null ? 200 : Integer
					.parseInt(statusCode));
		}
		return curi;
	}

	/**
	 * @return the ARC reader, or {@code null} if none was opened
	 */
	public ARCReader getARCReader() {
		return r;
	}

	/**
	 * @return the command line options understood by {@link #main(String[])}
	 */
	private static Options getOptions() {
		Options options = new Options();
		options.addOption(new Option("p", "parse", false, "Parse headers."));
		// main() starts with digest == false, so that is the documented
		// default.
		options.addOption(new Option("d", "digest", true,
				"Pass true|false. Expensive (SHA-1). Default: false."));
		options.addOption(new Option("s", "strict", false,
				"Strict mode. Fails parse if incorrectly formatted file."));
		return options;
	}

	/**
	 * Command line entry point: runs {@link #extract()} on every ARC URL or
	 * path given as an argument. A failure on one input is reported on stderr
	 * and processing continues with the next input. Usage is printed when no
	 * input is given.
	 * 
	 * NOTE(review): no extractor is registered here, so the output lists
	 * each record URL with an empty link list.
	 * 
	 * @param args
	 *            options (-p, -d true|false, -s) followed by ARC URLs or paths
	 * @throws ParseException
	 *             if the command line cannot be parsed
	 * @throws IOException
	 * @throws java.text.ParseException
	 */
	public static void main(String[] args) throws ParseException, IOException,
			java.text.ParseException {
		Options options = getOptions();
		PosixParser parser = new PosixParser();
		CommandLine cmdline = parser.parse(options, args, false);
		List<?> cmdlineArgs = cmdline.getArgList();
		Option[] cmdlineOptions = cmdline.getOptions();

		boolean digest = false;
		boolean strict = false;
		boolean parse = false;
		for (Option option : cmdlineOptions) {
			switch (option.getId()) {
			case 's':
				strict = true;
				break;
			case 'p':
				parse = true;
				break;
			case 'd':
				digest = getTrueOrFalse(option.getValue());
				break;
			default:
				// getId() is an int; cast it so the message shows the option
				// letter rather than its numeric code.
				throw new RuntimeException("Unexpected option: "
						+ (char) option.getId());
			}
		}

		if (cmdlineArgs.isEmpty()) {
			HelpFormatter formatter = new HelpFormatter();
			formatter.printHelp("ExtractorARC [options] ARC_URL_OR_PATH...",
					options);
			return;
		}

		for (Object arg : cmdlineArgs) {
			String urlOrPath = (String) arg;
			try {
				ExtractorARC extractor = new ExtractorARC(urlOrPath);
				extractor.getARCReader().setDigest(digest);
				extractor.getARCReader().setParseHttpHeaders(parse);
				extractor.getARCReader().setStrict(strict);
				extractor.extract();
			} catch (Exception e) {
				// Keep going with the remaining inputs, but never hide the
				// failure.
				System.err.println("Exception processing " + urlOrPath + ": "
						+ e.getMessage());
				e.printStackTrace(System.err);
			}
		}
	}

	/**
	 * @param value
	 *            option value, may be {@code null}
	 * @return {@code true} only if the value is "true" in any letter case;
	 *         {@code false} for {@code null}, empty or any other text
	 */
	private static boolean getTrueOrFalse(final String value) {
		return Boolean.parseBoolean(value);
	}

}