Package org.apache.nutch.storage

Examples of org.apache.nutch.storage.ProtocolStatus


                final FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
                fiq.crawlDelay = rules.getCrawlDelay();
              }
            }
            final ProtocolOutput output = protocol.getProtocolOutput(fit.url, fit.page);
            final ProtocolStatus status = output.getStatus();
            final Content content = output.getContent();
            // unblock queue
            fetchQueues.finishFetchItem(fit);

            context.getCounter("FetcherStatus", ProtocolStatusUtils.getName(status.getCode())).increment(1);

            int length = 0;
            if (content!=null && content.getContent()!=null) length= content.getContent().length;
            updateStatus(length);

            switch(status.getCode()) {

            case ProtocolStatusCodes.WOULDBLOCK:
              // retry ?
              fetchQueues.addFetchItem(fit);
              break;

            case ProtocolStatusCodes.SUCCESS:        // got a page
              output(fit, content, status, CrawlStatus.STATUS_FETCHED);
              break;

            case ProtocolStatusCodes.MOVED:         // redirect
            case ProtocolStatusCodes.TEMP_MOVED:
              byte code;
              boolean temp;
              if (status.getCode() == ProtocolStatusCodes.MOVED) {
                code = CrawlStatus.STATUS_REDIR_PERM;
                temp = false;
              } else {
                code = CrawlStatus.STATUS_REDIR_TEMP;
                temp = true;
              }
              final String newUrl = ProtocolStatusUtils.getMessage(status);
              handleRedirect(fit.url, newUrl, temp,  FetcherJob.PROTOCOL_REDIR, fit.page);
              output(fit, content, status, code);
              break;
            case ProtocolStatusCodes.EXCEPTION:
              logFetchFailure(fit.url, ProtocolStatusUtils.getMessage(status));
              /* FALLTHROUGH */
            case ProtocolStatusCodes.RETRY:          // retry
            case ProtocolStatusCodes.BLOCKED:
              output(fit, null, status, CrawlStatus.STATUS_RETRY);
              break;

            case ProtocolStatusCodes.GONE:           // gone
            case ProtocolStatusCodes.NOTFOUND:
            case ProtocolStatusCodes.ACCESS_DENIED:
            case ProtocolStatusCodes.ROBOTS_DENIED:
              output(fit, null, status, CrawlStatus.STATUS_GONE);
              break;

            case ProtocolStatusCodes.NOTMODIFIED:
              output(fit, null, status, CrawlStatus.STATUS_NOTMODIFIED);
              break;

            default:
              if (LOG.isWarnEnabled()) {
                LOG.warn("Unknown ProtocolStatus: " + status.getCode());
              }
              output(fit, null, status, CrawlStatus.STATUS_RETRY);
            }

          } catch (final Throwable t) {                 // unexpected exception
View Full Code Here


        } else { // convert to exception
          throw new FtpError(code);
        }
      }
    } catch (Exception e) {
      ProtocolStatus ps = ProtocolStatusUtils.makeStatus(
          ProtocolStatusCodes.EXCEPTION, e.toString());
      return new ProtocolOutput(null, ps);
    }
  }
View Full Code Here

                  Bytes.toStringBinary(entry.getValue()));
            }
          }
          res.put(f, simpleMeta);
        } else if ("protocolStatus".equals(f)) {
          ProtocolStatus ps = page.getProtocolStatus();
          res.put(f, ProtocolStatusUtils.toString(ps));
        } else if ("parseStatus".equals(f)) {
          ParseStatus ps = page.getParseStatus();
          res.put(f, ParseStatusUtils.toString(ps));
        } else if ("signature".equals(f)) {
View Full Code Here

                  Bytes.toStringBinary(entry.getValue().array()));
            }
          }
          res.put(f, simpleMeta);
        } else if ("protocolStatus".equals(f)) {
          ProtocolStatus ps = page.getProtocolStatus();
          res.put(f, ProtocolStatusUtils.toString(ps));
        } else if ("parseStatus".equals(f)) {
          ParseStatus ps = page.getParseStatus();
          res.put(f, ParseStatusUtils.toString(ps));
        } else if ("signature".equals(f)) {
View Full Code Here

      return "BLOCKED";
    return "UNKNOWN_CODE_" + code;
  }

  public static ProtocolStatus makeStatus(int code) {
    ProtocolStatus pstatus = new ProtocolStatus();
    pstatus.setCode(code);
    pstatus.setLastModified(0);
    return pstatus;
  }
View Full Code Here

    pstatus.setLastModified(0);
    return pstatus;
  }

  public static ProtocolStatus makeStatus(int code, String message) {
    ProtocolStatus pstatus = makeStatus(code);
    pstatus.addToArgs(new Utf8(message));
    return pstatus;
  }
View Full Code Here

                  final FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
                  fiq.crawlDelay = rules.getCrawlDelay();
                }
              }
              final ProtocolOutput output = protocol.getProtocolOutput(fit.url, fit.page);
              final ProtocolStatus status = output.getStatus();
              final Content content = output.getContent();
              // unblock queue
              fetchQueues.finishFetchItem(fit);

              context.getCounter("FetcherStatus", ProtocolStatusUtils.getName(status.getCode())).increment(1);

              int length = 0;
              if (content!=null && content.getContent()!=null) length= content.getContent().length;
              updateStatus(length);

              switch(status.getCode()) {

              case ProtocolStatusCodes.WOULDBLOCK:
                // retry ?
                fetchQueues.addFetchItem(fit);
                break;

              case ProtocolStatusCodes.SUCCESS:        // got a page
                output(fit, content, status, CrawlStatus.STATUS_FETCHED);
                break;

              case ProtocolStatusCodes.MOVED:         // redirect
              case ProtocolStatusCodes.TEMP_MOVED:
                byte code;
                boolean temp;
                if (status.getCode() == ProtocolStatusCodes.MOVED) {
                  code = CrawlStatus.STATUS_REDIR_PERM;
                  temp = false;
                } else {
                  code = CrawlStatus.STATUS_REDIR_TEMP;
                  temp = true;
                }
                output(fit, content, status, code);
                final String newUrl = ProtocolStatusUtils.getMessage(status);
                handleRedirect(fit.url, newUrl, temp,  FetcherJob.PROTOCOL_REDIR);
                redirecting = false;
                break;
              case ProtocolStatusCodes.EXCEPTION:
                logError(fit.url, ProtocolStatusUtils.getMessage(status));
                /* FALLTHROUGH */
              case ProtocolStatusCodes.RETRY:          // retry
              case ProtocolStatusCodes.BLOCKED:
                output(fit, null, status, CrawlStatus.STATUS_RETRY);
                break;

              case ProtocolStatusCodes.GONE:           // gone
              case ProtocolStatusCodes.NOTFOUND:
              case ProtocolStatusCodes.ACCESS_DENIED:
              case ProtocolStatusCodes.ROBOTS_DENIED:
                output(fit, null, status, CrawlStatus.STATUS_GONE);
                break;

              case ProtocolStatusCodes.NOTMODIFIED:
                output(fit, null, status, CrawlStatus.STATUS_NOTMODIFIED);
                break;

              default:
                if (LOG.isWarnEnabled()) {
                  LOG.warn("Unknown ProtocolStatus: " + status.getCode());
                }
                output(fit, null, status, CrawlStatus.STATUS_RETRY);
              }

              if (redirecting && redirectCount > maxRedirect) {
View Full Code Here

          throw new FileError(code);
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
      ProtocolStatus ps = ProtocolStatusUtils.makeStatus(
          ProtocolStatusCodes.EXCEPTION, e.toString());
      return new ProtocolOutput(null, ps);
    }
  }
View Full Code Here

                  LOG.info("Crawl delay for queue: " + fit.queueID + " is set to " + fiq.crawlDelay + " as per robots.txt. url: " + fit.url);
                }
              }
            }
            final ProtocolOutput output = protocol.getProtocolOutput(fit.url, fit.page);
            final ProtocolStatus status = output.getStatus();
            final Content content = output.getContent();
            // unblock queue
            fetchQueues.finishFetchItem(fit);

            context.getCounter("FetcherStatus", ProtocolStatusUtils.getName(status.getCode())).increment(1);

            int length = 0;
            if (content!=null && content.getContent()!=null) length= content.getContent().length;
            updateStatus(length);

            switch(status.getCode()) {

            case ProtocolStatusCodes.WOULDBLOCK:
              // retry ?
              fetchQueues.addFetchItem(fit);
              break;

            case ProtocolStatusCodes.SUCCESS:        // got a page
              output(fit, content, status, CrawlStatus.STATUS_FETCHED);
              break;

            case ProtocolStatusCodes.MOVED:         // redirect
            case ProtocolStatusCodes.TEMP_MOVED:
              byte code;
              boolean temp;
              if (status.getCode() == ProtocolStatusCodes.MOVED) {
                code = CrawlStatus.STATUS_REDIR_PERM;
                temp = false;
              } else {
                code = CrawlStatus.STATUS_REDIR_TEMP;
                temp = true;
              }
              final String newUrl = ProtocolStatusUtils.getMessage(status);
              handleRedirect(fit.url, newUrl, temp,  FetcherJob.PROTOCOL_REDIR, fit.page);
              output(fit, content, status, code);
              break;
            case ProtocolStatusCodes.EXCEPTION:
              logFetchFailure(fit.url, ProtocolStatusUtils.getMessage(status));
              /* FALLTHROUGH */
            case ProtocolStatusCodes.RETRY:          // retry
            case ProtocolStatusCodes.BLOCKED:
              output(fit, null, status, CrawlStatus.STATUS_RETRY);
              break;

            case ProtocolStatusCodes.GONE:           // gone
            case ProtocolStatusCodes.NOTFOUND:
            case ProtocolStatusCodes.ACCESS_DENIED:
            case ProtocolStatusCodes.ROBOTS_DENIED:
              output(fit, null, status, CrawlStatus.STATUS_GONE);
              break;

            case ProtocolStatusCodes.NOTMODIFIED:
              output(fit, null, status, CrawlStatus.STATUS_NOTMODIFIED);
              break;

            default:
              if (LOG.isWarnEnabled()) {
                LOG.warn("Unknown ProtocolStatus: " + status.getCode());
              }
              output(fit, null, status, CrawlStatus.STATUS_RETRY);
            }

          } catch (final Throwable t) {                 // unexpected exception
View Full Code Here

                  Bytes.toStringBinary(entry.getValue().array()));
            }
          }
          res.put(f, simpleMeta);
        } else if ("protocolStatus".equals(f)) {
          ProtocolStatus ps = page.getProtocolStatus();
          res.put(f, ProtocolStatusUtils.toString(ps));
        } else if ("parseStatus".equals(f)) {
          ParseStatus ps = page.getParseStatus();
          res.put(f, ParseStatusUtils.toString(ps));
        } else if ("signature".equals(f)) {
View Full Code Here

TOP

Related Classes of org.apache.nutch.storage.ProtocolStatus

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.