Package cascading.tuple

Examples of cascading.tuple.TupleEntry


    }
  }
 
  @Override
  public void operate(FlowProcess process, FunctionCall<NullContext> functionCall) {
        TupleEntry arguments = functionCall.getArguments();
        FetchedDatum fetchedDatum = new FetchedDatum(arguments.getTuple());

        if (fetchedDatum.getContentType().startsWith("text/html")) {
          init();

          Metadata metadata = new Metadata();
View Full Code Here


       
        // Test for all valid fetches.
        Tap validate = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath);
        TupleEntryIterator tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
        while (tupleEntryIterator.hasNext()) {
            TupleEntry entry = tupleEntryIterator.next();
            StatusDatum sd = new StatusDatum(entry);
            if (sd.getStatus() != UrlStatus.FETCHED) {
                LOGGER.error(String.format("Fetched failed! Status is %s for %s", sd.getStatus(), sd.getUrl()));
                BaseFetchException e = sd.getException();
                if (e != null) {
View Full Code Here

        // Test for 10 good fetches.
        Tap validate = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath);
        TupleEntryIterator tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
        int fetchedPages = 0;
        while (tupleEntryIterator.hasNext()) {
            TupleEntry entry = tupleEntryIterator.next();
            new FetchedDatum(entry);
            fetchedPages += 1;
        }

        Assert.assertEquals(10, fetchedPages);
View Full Code Here

        float linkScore = 0;

        String url = null;

        while (iter.hasNext()) {
            TupleEntry entry = iter.next();
           
            boolean isCrawlDatum = entry.getString(CRAWLDBDATUM_URL_FIELD) != null;
            boolean isStatus = entry.getString(STATUSDATUM_URL_FIELD) != null;
            boolean isAnalyzed = entry.getString(ANALYZEDDATUM_URL_FIELD) != null;
            if (isCrawlDatum) {
               Tuple crawlDbTuple = TupleEntry.select(CrawlDbDatum.FIELDS, entry);
               crawlDbDatum = new CrawlDbDatum(crawlDbTuple);
               url = crawlDbDatum.getUrl();
            }
           
            if (isStatus) {
                statusDatum = new StatusDatum(entry);
                url = statusDatum.getUrl();
            }

            if (isAnalyzed) {
                Tuple analyzedTuple = TupleEntry.select(AnalyzedDatum.FIELDS, entry);
                analyzedDatum = new AnalyzedDatum(analyzedTuple);
                url = analyzedDatum.getUrl();
            }

            // we could have either status + link or just link tuple entry
            if (entry.getString(new Fields(LinkDatum.URL_FN)) != null) {
                LinkDatum linkDatum = new LinkDatum(TupleEntry.select(LinkDatum.FIELDS, entry));
               
                pageScore = linkDatum.getPageScore();
                // Add up the link scores
                linkScore += linkDatum.getLinkScore();
View Full Code Here

  @Test
  public void testSplitterWithNonMbox() {
    MboxSplitterFunction splitter = new MboxSplitterFunction();
   
    FetchedDatum datum = new FetchedDatum("baseUrl", "redirectedUrl", 0, new HttpHeaders(), new ContentBytes(), "text/ascii", 0);
    TupleEntry value = new TupleEntry(datum.getTupleEntry());
   
    when(_funcCall.getArguments()).thenReturn(value);
    splitter.operate(_process, _funcCall);
   
    verify(_collector).add(value);
View Full Code Here

    MboxSplitterFunction splitter = new MboxSplitterFunction();

    final String mboxString = "From 1\r\rContent 1\r\rFrom 2\r\rContent 2";
    byte[] mboxContent = mboxString.getBytes("us-ascii");
    FetchedDatum datum = new FetchedDatum("baseUrl", "redirectedUrl", 0, new HttpHeaders(), new ContentBytes(mboxContent), "application/mbox", 0);
    TupleEntry value = new TupleEntry(datum.getTupleEntry());
   
    when(_funcCall.getArguments()).thenReturn(value);
    splitter.operate(_process, _funcCall);

    verify(_collector, times(2)).add(any(TupleEntry.class));
View Full Code Here

  @Override
  public void operate(FlowProcess process, FunctionCall<NullContext> functionCall) {
    init();
   
    // On input we have a FetchedDatum that holds a single email.
        TupleEntry arguments = functionCall.getArguments();
        FetchedDatum fetchedDatum = new FetchedDatum(arguments.getTuple());
       
        // Now, if the FetchedDatum mime-type is application/mbox, we want to parse it and
        // output the results
        if (fetchedDatum.getContentType().equals("application/mbox")) {
          Metadata metadata = new Metadata();
View Full Code Here

    HashSet<String> names = new HashSet<String>();
       
    Iterator<TupleEntry> iter = bufferCall.getArgumentsIterator();
    while (iter.hasNext()) {
      TupleEntry entry = iter.next();
     
      double score = entry.getDouble(FieldNames.SCORE);
      summedScore += score;
     
      String name = entry.getString(FieldNames.EMAIL_NAME);
      if (name != null) {
        name = name.trim();
        if (name.startsWith("\"") && name.endsWith("\"")) {
          name = name.substring(1, name.length() - 1);
        }
View Full Code Here

    String name = null;
    double score = 0.0;
   
    Iterator<TupleEntry> iter = bufferCall.getArgumentsIterator();
    while (iter.hasNext()) {
      TupleEntry entry = iter.next();
     
      String possibleEmail = entry.getString(FieldNames.EMAIL_ADDRESS);
      if (possibleEmail != null) {
        if (email != null) {
          if (!email.equals(possibleEmail)) {
            LOGGER.warn(String.format("Duplicate entry found for email addresses (%s - %s | %s)",
                entry.getString(FieldNames.MESSAGE_ID), email, possibleEmail));
          } else {
            // We occasionally get dup message ids because a msg gets archived twice.
            // FUTURE KKr - ignore duplicate message ids. Since they typically occur in order, we
            // could do it in our parse handling code.
          }
        } else {
          email = possibleEmail;
          name = entry.getString(FieldNames.EMAIL_NAME);
        }
      }
     
      score += entry.getDouble(FieldNames.SCORE);
    }
   
    if (email != null) {
      bufferCall.getOutputCollector().add(new Tuple(email, name, score));
    }
View Full Code Here

        boolean allOK = false;
        int verifiedCnt = 0;
        Tap sourceTap = platform.makeTap(platform.makeTextScheme(), dataPath);
        TupleEntryIterator tupleEntryIterator = sourceTap.openForRead(platform.makeFlowProcess());
        while (tupleEntryIterator.hasNext()) {
          TupleEntry next = tupleEntryIterator.next();
          String line = next.getString("line");
          String[] split = line.split("\t");
          if (split[0].equals(PAGE1_URL)) {
              allOK |= split[3].equals(PAGE1_SCORE);
              verifiedCnt++;
          } else if (split[0].equals(PAGE2_URL)) {
View Full Code Here

TOP

Related Classes of cascading.tuple.TupleEntry

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.