Package org.archive.io

Examples of org.archive.io.ReplayCharSequence


        // Ok, if we got this far we need to calculate the content digest.
        // Get the regex
        String regex = getStripRegex();
       
        // Get a replay of the document character seq.
        ReplayCharSequence cs = null;
        try {
           cs = curi.getRecorder().getContentReplayCharSequence();
           // Create a MessageDigest
           MessageDigest digest = null;
           try {
               digest = MessageDigest.getInstance(SHA1);
           } catch (NoSuchAlgorithmException e1) {
               e1.printStackTrace();
               return;
           }

           digest.reset();

           String s = null;

           if (StringUtils.isEmpty(regex)) {
               s = cs.toString();
           } else {
               // Process the document
               Matcher m = TextUtils.getMatcher(regex, cs);
               s = m.replaceAll(" ");
               TextUtils.recycleMatcher(m);
View Full Code Here


    /**
     * @param curi Crawl URI to process.
     */
    public boolean innerExtract(CrawlURI curi) {
        try {
            ReplayCharSequence cs = curi.getRecorder().getContentReplayCharSequence();
            numberOfLinksExtracted.addAndGet(
                processStyleCode(this, curi, cs));
            // Set flag to indicate that link extraction is completed.
            return true;
        } catch (IOException e) {
View Full Code Here

                }
            }
        }

        try {
            ReplayCharSequence cs = curi.getRecorder().getContentReplayCharSequence();
           // Extract all links from the charsequence
           extract(curi, cs);
           if(cs.getDecodeExceptionCount()>0) {
               curi.getNonFatalFailures().add(cs.getCodingException());
           }
           // Set flag to indicate that link extraction is completed.
           return true;
        } catch (IOException e) {
            curi.getNonFatalFailures().add(e);
View Full Code Here

        return uri.containsDataKey(ExtractorHTML.A_FORM_OFFSETS);
    }

    public void extract(CrawlURI curi) {
        try {
            ReplayCharSequence cs = curi.getRecorder().getContentReplayCharSequence();
            analyze(curi, cs);
        } catch (IOException e) {
            curi.getNonFatalFailures().add(e);
            logger.log(Level.WARNING,"Failed get of replay char sequence in " +
                    Thread.currentThread().getName(), e);
View Full Code Here

            matchLists.put("uriRegex", new MatchList(new GroupList(matcher)));
        } else {
            return; // if uri regex doesn't match, we're done
        }
       
        ReplayCharSequence cs;
        try {
            cs = curi.getRecorder().getContentReplayCharSequence();
        } catch (IOException e) {
            curi.getNonFatalFailures().add(e);
            LOGGER.log(Level.WARNING, "Failed get of replay char sequence in "
View Full Code Here

   

    @Override
    protected boolean innerExtract(CrawlURI curi) {
        this.numberOfCURIsHandled++;
        ReplayCharSequence cs = null;
        try {
            cs = curi.getRecorder().getContentReplayCharSequence();
            try {
                numberOfLinksExtracted.addAndGet(considerStrings(curi, cs));
            } catch (StackOverflowError e) {
View Full Code Here

    private void extract(CrawlURI curi, Recorder recorder) {
        if (!getExtractFromDirs()) {
            return;
        }
       
        ReplayCharSequence seq = null;
        try {
            seq = recorder.getContentReplayCharSequence();
            extract(curi, seq);
        } catch (IOException e) {
            logger.log(Level.SEVERE, "IO error during extraction.", e);
View Full Code Here

    /**
     * @param curi Crawl URI to process.
     */
    @Override
    protected boolean innerExtract(CrawlURI curi) {
        ReplayCharSequence cs = null;
        Charset contentDeclaredEncoding = null;
        try {
            // if charset not spec'd in http header look for <?xml encoding=""?>
            if (!curi.containsContentTypeCharsetDeclaration()) {
                String contentPrefix = curi.getRecorder().getContentReplayPrefixString(50);
View Full Code Here

                        uri.getUURI().toCustomString());
    }

    @Override
    protected void extract(CrawlURI uri) {
        ReplayCharSequence cs;
        try {
            cs = uri.getRecorder().getContentReplayCharSequence();
        } catch (IOException e) {
            uri.getNonFatalFailures().add(e);
            logger.log(Level.WARNING, "Failed get of replay char sequence in "
View Full Code Here

                        uri.getUURI().toCustomString());
    }   
   
    @Override
    protected void extract(CrawlURI uri) {
        ReplayCharSequence cs;
        try {
            cs = uri.getRecorder().getContentReplayCharSequence();
        } catch (IOException e) {
            uri.getNonFatalFailures().add(e);
            logger.log(Level.WARNING, "Failed get of replay char sequence in "
View Full Code Here

TOP

Related Classes of org.archive.io.ReplayCharSequence

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.