package org.archive.modules.extractor;

import java.io.*;
import java.util.*;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpConnection;
import org.apache.commons.httpclient.HttpMethodBase;
import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.io.ArchiveRecord;
import org.archive.io.arc.ARCReader;
import org.archive.io.arc.ARCReaderFactory;
import org.archive.io.arc.ARCRecord;
import org.archive.io.arc.ARCRecordMetaData;
import org.archive.modules.DefaultProcessorURI;
import org.archive.modules.Processor;
import org.archive.modules.ProcessorURI;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.state.ExampleStateProvider;
import org.archive.util.Recorder;

/**
 * Link extractors for any given ARC file.
 *
 * <p>An instance wraps an {@link ARCReader}, runs every registered extractor
 * over each record of the ARC and writes the outlinks found to a text file.
 *
 * @author knguyen
 */
public class ExtractorARC {

    /**
     * An ARC reader to iterate over each ARC record from the input.
     * Stays {@code null} when the no-argument constructor is used.
     */
    private ARCReader r;

    /**
     * List of extractors that will be applied over each record.
     */
    private final List<Processor> extractors = new ArrayList<Processor>();

    /**
     * Output filename. Assigned by {@link #ExtractorARC(String)}; stays
     * {@code null} when the no-argument constructor is used.
     */
    private String fOut;

    /**
     * Should we digest as we read?
     */
    private boolean digest = false;

    /**
     * Should the parse be strict?
     */
    private boolean strict = false;

    /**
     * Should we parse header
     */
    private boolean parse = false;

    /**
     * Creates an instance without a reader or an output file. Such an
     * instance can only be used through {@link #extract(DefaultProcessorURI)};
     * {@link #extract()} needs the reader set up by
     * {@link #ExtractorARC(String)}.
     */
    public ExtractorARC() {
    }

    /**
     * Opens the given ARC and derives the output file name from the reader's
     * file name, the current time in milliseconds and the suffix
     * {@code _outlinks.txt}.
     *
     * <p>NOTE(review): a {@link RuntimeException} while opening or
     * configuring the reader terminates the JVM via {@code System.exit(1)}.
     * That is kept for backward compatibility but is hostile to callers
     * embedding this class; consider rethrowing instead.
     *
     * @param urlOrPath
     *            of the ARC file
     * @throws ParseException
     * @throws IOException
     * @throws java.text.ParseException
     */
    public ExtractorARC(String urlOrPath) throws ParseException, IOException,
            java.text.ParseException {
        try {
            r = ARCReaderFactory.get(urlOrPath);
            r.setStrict(strict);
            r.setParseHttpHeaders(parse);
            r.setDigest(digest);
        } catch (RuntimeException e) {
            System.err.println("Exception processing " + urlOrPath + ": "
                    + e.getMessage());
            e.printStackTrace(System.err);
            System.exit(1);
        }
        fOut = r.getFileName() + "_" + System.currentTimeMillis()
                + "_outlinks.txt";
    }

    /**
     * @return the reader opened by {@link #ExtractorARC(String)}, or
     *         {@code null} if the no-argument constructor was used
     */
    public ARCReader getARCReader() {
        return r;
    }

    /**
     * Reads the whole record through a fresh {@link Recorder} so that the
     * recorder ends up holding the record's bytes.
     *
     * <p>An earlier draft of this class used
     * {@code Recorder.wrapInputStreamWithHttpRecord(scratchDir, name, ar, null)}
     * instead; it is unclear how to build a recorder without a temporary
     * disk file. FIX ME please.
     *
     * @param ar
     *            an ARC record
     * @return a mock-up recorder that gets its content from the given ARC
     *         record
     * @throws Exception
     *             if the temporary file cannot be created or the record
     *             cannot be read
     */
    public Recorder createRecorderFromARCRecord(ARCRecord ar) throws Exception {
        File temp = File.createTempFile("test", ".tmp");
        // One temp file is created per record and was never removed before;
        // at least clean it up when the JVM exits. It is not deleted eagerly
        // because the returned recorder may still refer to it.
        temp.deleteOnExit();

        Recorder recorder = new Recorder(temp, 2048, 2048);
        InputStream is = recorder.inputWrap(ar);
        try {
            int offset = 0;
            int contentBegin = ar.getHeader().getContentBegin();
            // Drain the record byte by byte; reading through the wrapped
            // stream is what feeds the recorder.
            for (int x = is.read(); x >= 0; x = is.read()) {
                if (offset == contentBegin) {
                    // mark where content begins
                    // NOTE(review): at this point the byte at index
                    // contentBegin has already been read -- confirm the mark
                    // is not one byte late.
                    recorder.markContentBegin();
                }
                offset++;
            }
        } finally {
            // Close the stream even when reading fails part-way.
            is.close();
        }
        return recorder;
    }

    /**
     * Adds an extractor to the extractor list. When the extractor is an
     * {@link ExtractorHTTP} and a reader is present, HTTP header parsing is
     * switched on for the reader.
     *
     * @param aE
     *            an extractor object
     */
    public void addExtractor(Extractor aE) {
        extractors.add(aE);
        if ((aE instanceof ExtractorHTTP) && (r != null)) {
            r.setParseHttpHeaders(true);
        }
    }

    /**
     * Adds every extractor of the given collection, in iteration order, via
     * {@link #addExtractor(Extractor)}.
     *
     * @param e
     *            extractors to add
     */
    public void addExtractor(Collection<? extends Extractor> e) {
        for (Extractor aE : e) {
            addExtractor(aE);
        }
    }

    /**
     * Applies all registered extractors to the given URI. Any exception is
     * reported on standard output (plus a stack trace) and stops the
     * remaining extractors; it is not propagated.
     *
     * @param euri
     *            the URI object to run the extractors on
     */
    public void extract(DefaultProcessorURI euri) {
        try {
            for (Processor e : extractors) {
                e.process(euri);
            }
        } catch (Exception e) {
            System.out.println("Exception");
            e.printStackTrace();
        }
    }

    /**
     * Extracts outlinks from every record of the input ARC. Output is
     * written into the text file chosen by the constructor. The writer and
     * the reader are closed when this method returns, whether it succeeds
     * or fails.
     *
     * @throws IllegalStateException
     *             if this instance has no reader or output file, i.e. it was
     *             built with the no-argument constructor
     * @throws Exception
     *             if reading the ARC or writing the output fails
     */
    public void extract() throws Exception {
        if (r == null || fOut == null) {
            throw new IllegalStateException(
                    "No ARC reader/output file; construct with ExtractorARC(String)");
        }

        BufferedWriter out = null;
        try {
            out = new BufferedWriter(new FileWriter(fOut));
            Iterator<ArchiveRecord> ite = r.iterator();
            // move through all the ARC records
            while (ite.hasNext()) {
                writeOutlinks((ARCRecord) ite.next(), out);
            }
        } finally {
            // Release both resources even if a record failed; the nested
            // finally makes sure the reader is closed when closing the
            // writer throws.
            try {
                if (out != null) {
                    out.close();
                }
            } finally {
                r.close();
            }
        }
    }

    /**
     * Runs the extractors over a single record and appends the record URL
     * followed by its outlinks, enclosed in {@code <} / {@code >} lines, to
     * the output.
     */
    private void writeOutlinks(ARCRecord ar, BufferedWriter out)
            throws Exception {
        ARCRecordMetaData metaData = ar.getMetaData();

        // manufacture a recorder and an extractor URI
        Recorder recorder = createRecorderFromARCRecord(ar);

        // create a mocked-up CURI so that extractors can use it
        UURI src = UURIFactory.getInstance(metaData.getUrl());
        DefaultProcessorURI euri = new DefaultProcessorURI(src,
                LinkContext.NAVLINK_MISC);
        euri.setRecorder(recorder);
        euri.setContentLength(metaData.getLength());
        euri.setContentType(metaData.getMimetype());

        // apply all extractors over an ARC record.
        // ExtractorTool.java from Heritrix 1 would use
        // getProcessorURI(ar, recorder) here, but a ProcessorURI created
        // that way doesn't seem to work with extractors: usually they find
        // no outlinks.
        try {
            for (Processor e : extractors) {
                e.process(euri);
            }
        } catch (Exception e) {
            // if something goes wrong, report it via output file as well
            out.write("EXCEPTION \n");
            out.write(metaData.toString() + "\n");
            e.printStackTrace();
            out.write("END_EXCEPTION \n");
        }

        // write extracted links to output file
        Link[] links = euri.getOutLinks().toArray(new Link[0]);
        out.write(ar.getHeader().getUrl() + "\n");
        out.write("< \n");
        for (Link l : links) {
            out.write(l.toString() + "\n");
        }
        out.write("> \n");
    }

    /**
     * Builds a URI object for the record that carries the record's length,
     * mimetype and the given recorder. For every scheme except
     * {@code filedesc} it also stores a stub HTTP method, whose response
     * headers come from the record metadata, and sets the fetch status to
     * the record's status code (200 when the record has none).
     *
     * <p>Currently not called from within this class; see the note in
     * {@code writeOutlinks}.
     *
     * @param record
     *            the ARC record to describe
     * @param r
     *            the recorder holding the record's content
     * @return the populated URI object
     * @throws URIException
     *             if the record URL cannot be turned into a UURI
     */
    protected ProcessorURI getProcessorURI(final ARCRecord record,
            final Recorder r) throws URIException {
        UURI src = UURIFactory.getInstance(record.getMetaData().getUrl());
        DefaultProcessorURI curi = new DefaultProcessorURI(src,
                LinkContext.NAVLINK_MISC);
        // Heritrix 1 built the URI with
        //   new CrawlURI(UURIFactory.getInstance(record.getMetaData().getUrl()))
        // When that constructor is used here I get a
        // "ToeThread never set up CrawlURI's sheet" exception (line 1531,
        // CrawlURI.java). It has something to do with StateProvider. I don't
        // know how to fix it. Please help if you think the above approach is
        // only suboptimal.
        curi.setContentLength(record.getMetaData().getLength());
        curi.setContentType(record.getMetaData().getMimetype());
        curi.setRecorder(r);

        // Fake out the extractor that this is a legit HTTP transaction.
        if (!curi.getUURI().getScheme().equals("filedesc")) {
            // before, in Heritrix 1, it was
            // curi.putObject(CoreAttributeConstants.....etc)
            // please double-check if this modification is an equivalent
            curi.getData().put(CoreAttributeConstants.A_HTTP_TRANSACTION,
                    new HttpMethodBase() {
                        @Override
                        public String getName() {
                            return this.getClass().getName() + "_method";
                        }

                        @Override
                        public Header getResponseHeader(String headerName) {
                            String value = (String) record.getMetaData()
                                    .getHeaderValue(headerName);
                            return (value == null || value.length() == 0)
                                    ? null
                                    : new Header(headerName, value);
                        }
                    });
            String statusCode = record.getMetaData().getStatusCode();
            curi.setFetchStatus(statusCode == null ? 200 : Integer
                    .parseInt(statusCode));
        }
        return curi;
    }

    /**
     * @return the command-line options understood by {@link #main(String[])}
     */
    private static Options getOptions() {
        Options options = new Options();
        options.addOption(new Option("p", "parse", false, "Parse headers."));
        // The help text used to claim "Default: true", but main() starts
        // with digest = false.
        options.addOption(new Option("d", "digest", true,
                "Pass true|false. Expensive. Default: false (SHA-1 when true)."));
        options.addOption(new Option("s", "strict", false,
                "Strict mode. Fails parse if incorrectly formatted file."));
        return options;
    }

    /**
     * Command-line entry point: every non-option argument is treated as the
     * URL or path of an ARC and processed with {@link #extract()}. A failure
     * on one input is reported on standard error and the remaining inputs
     * are still processed. With no inputs, usage help is printed.
     *
     * <p>Note that this entry point registers no extractors, so the output
     * files list each record with an empty set of links.
     *
     * @param args
     *            options followed by ARC URLs or paths
     * @throws ParseException
     *             if the command line cannot be parsed
     * @throws IOException
     * @throws java.text.ParseException
     */
    public static void main(String[] args) throws ParseException, IOException,
            java.text.ParseException {
        Options options = getOptions();
        PosixParser parser = new PosixParser();
        CommandLine cmdline = parser.parse(options, args, false);
        List<?> cmdlineArgs = cmdline.getArgList();
        Option[] cmdlineOptions = cmdline.getOptions();

        if (cmdlineArgs.isEmpty()) {
            // Nothing to process: say how the tool is meant to be called
            // instead of exiting silently.
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp("ExtractorARC [options] ARC_URL_OR_PATH...",
                    options);
            return;
        }

        boolean digest = false;
        boolean strict = false;
        boolean parse = false;
        for (Option option : cmdlineOptions) {
            switch (option.getId()) {
            case 's':
                strict = true;
                break;
            case 'p':
                parse = true;
                break;
            case 'd':
                digest = getTrueOrFalse(option.getValue());
                break;
            default:
                // Cast so the message shows the option letter rather than
                // its numeric character code.
                throw new RuntimeException("Unexpected option: "
                        + (char) option.getId());
            }
        }

        for (Object arg : cmdlineArgs) {
            String urlOrPath = (String) arg;
            try {
                ExtractorARC extractor = new ExtractorARC(urlOrPath);
                extractor.getARCReader().setDigest(digest);
                extractor.getARCReader().setParseHttpHeaders(parse);
                extractor.getARCReader().setStrict(strict);
                extractor.extract();
            } catch (Exception e) {
                // Best effort: report the failure and carry on with the
                // next input rather than swallowing it silently.
                System.err.println("Exception processing " + urlOrPath + ": "
                        + e.getMessage());
                e.printStackTrace(System.err);
            }
        }
    }

    /**
     * @param value
     *            option value, may be {@code null}
     * @return {@code true} only if the value equals "true" ignoring case;
     *         {@code false} for {@code null}, empty or any other text
     */
    private static boolean getTrueOrFalse(final String value) {
        return Boolean.parseBoolean(value);
    }
}