Examples of Outlink


Examples of bixo.datum.Outlink

    }

    @Override
    public void operate(FlowProcess process, FunctionCall<NullContext> funcCall) {
        ParsedDatum datum = new ParsedDatum(funcCall.getArguments());
        Outlink outlinks[] = datum.getOutlinks();

        // Bump the crawl depth value only on a successful parse
        int crawlDepth = (Integer) datum.getPayloadValue(CrawlDbDatum.CRAWL_DEPTH);
        datum.setPayloadValue(CrawlDbDatum.CRAWL_DEPTH, crawlDepth + 1);
View Full Code Here

Examples of bixo.datum.Outlink

    @SuppressWarnings("rawtypes")
    @Override
    public void operate(FlowProcess process, FunctionCall<NullContext> funcCall) {
       
        AnalyzedDatum datum = new AnalyzedDatum(funcCall.getArguments().getTuple());
        Outlink outlinks[] = datum.getOutlinks();

        TupleEntryCollector collector = funcCall.getOutputCollector();

        if (outlinks.length > 0) {
            float pageScore = datum.getPageScore();
View Full Code Here

Examples of bixo.datum.Outlink

       
        for (Node node : aNodes) {
            String url = getAttributeFromNode(node, "href");
            String anchor = getAttributeFromNode(node, "name");
            String rel = getAttributeFromNode(node, "rel");
            Outlink link = new Outlink(url, anchor, rel);
            outlinkList.add(link);
        }
   
        return outlinkList.toArray(new Outlink[outlinkList.size()]);
    }
View Full Code Here

Examples of bixo.datum.Outlink

    @Override
    public void endElement(String uri, String localName, String name) throws SAXException {
        super.endElement(uri, localName, name);

        if (localName.equalsIgnoreCase(_inAnchorTag)) {
            addLink(new Outlink(_curUrl, _curAnchor.toString(), _curRelAttributes));
            _inAnchorTag = null;
        }
    }
View Full Code Here

Examples of net.nutch.parse.Outlink

      setAlbum(value);
    if (name.equals("TPE1-Text"))
      setArtist(value);

    if (name.indexOf("URL Link") > -1) {
      links.add(new Outlink(value, ""));
    } else if (name.indexOf("Text") > -1) {
      text += value + "\n";
    }

    metadata.setProperty(name, value);
View Full Code Here

Examples of net.nutch.parse.Outlink

            }
          }
          if (target != null)
            try {
              URL url = new URL(base, target);
              outlinks.add(new Outlink(url.toString(),
                                       linkText.toString().trim()));
            } catch (MalformedURLException e) {
              // don't care
            }
        }
View Full Code Here

Examples of net.nutch.parse.Outlink

      testDOMs[i]= node;
    }
    try {
     answerOutlinks = new Outlink[][]{
         {
           new Outlink("http://www.nutch.org", "anchor"),
         },
         {
           new Outlink("http://www.nutch.org/", "home"),
           new Outlink("http://www.nutch.org/docs/bot.html", "bots"),
         },
         {
           new Outlink("http://www.nutch.org/", "separate this"),
           new Outlink("http://www.nutch.org/docs/ok", "from this"),
         },
         {
           new Outlink("http://www.nutch.org/", "home"),
           new Outlink("http://www.nutch.org/docs/1", "1"),
           new Outlink("http://www.nutch.org/docs/2", "2"),
         },
         {
           new Outlink("http://www.nutch.org/frames/top.html", ""),
           new Outlink("http://www.nutch.org/frames/left.html", ""),
           new Outlink("http://www.nutch.org/frames/invalid.html", ""),
           new Outlink("http://www.nutch.org/frames/right.html", ""),
         },
         {
           new Outlink("http://www.nutch.org/index.html", ""),
           new Outlink("http://www.nutch.org/maps/#bottom", ""),
           new Outlink("http://www.nutch.org/bot.html", ""),
           new Outlink("http://www.nutch.org/docs/index.html", ""),
         },
         {
             new Outlink("http://www.nutch.org/index.html", "whitespace test"),
         },
      };
  
    } catch (MalformedURLException e) {
       
View Full Code Here

Examples of org.apache.nutch.parse.Outlink

            if (target != null && !noFollow && !post)
              try {
               
                URL url = (base.toString().indexOf(';') > 0) ?
                  fixEmbeddedParams(base, target) : new URL(base, target);
                outlinks.add(new Outlink(url.toString(),
                                         linkText.toString().trim()));
              } catch (MalformedURLException e) {
                // don't care
              }
          }
View Full Code Here

Examples of org.apache.nutch.parse.Outlink

      testDOMs[i]= node;
    }
    try {
     answerOutlinks = new Outlink[][]{
         {
           new Outlink("http://www.nutch.org", "anchor"),
         },
         {
           new Outlink("http://www.nutch.org/", "home"),
           new Outlink("http://www.nutch.org/docs/bot.html", "bots"),
         },
         {
           new Outlink("http://www.nutch.org/", "separate this"),
           new Outlink("http://www.nutch.org/docs/ok", "from this"),
         },
         {
           new Outlink("http://www.nutch.org/", "home"),
           new Outlink("http://www.nutch.org/docs/1", "1"),
           new Outlink("http://www.nutch.org/docs/2", "2"),
         },
         {
           new Outlink("http://www.nutch.org/frames/top.html", ""),
           new Outlink("http://www.nutch.org/frames/left.html", ""),
           new Outlink("http://www.nutch.org/frames/invalid.html", ""),
           new Outlink("http://www.nutch.org/frames/right.html", ""),
         },
         {
           new Outlink("http://www.nutch.org/maps/logo.gif", ""),
           new Outlink("http://www.nutch.org/index.html", ""),
           new Outlink("http://www.nutch.org/maps/#bottom", ""),
           new Outlink("http://www.nutch.org/bot.html", ""),
           new Outlink("http://www.nutch.org/docs/index.html", ""),
         },
         {
             new Outlink("http://www.nutch.org/index.html", "whitespace test"),
         },
         {
         },
         {
           new Outlink("http://www.nutch.org/dummy.jsp", "test2"),
         },
         {
         },
         {
           new Outlink("http://www.nutch.org/;x", "anchor1"),
           new Outlink("http://www.nutch.org/g;x", "anchor2"),
           new Outlink("http://www.nutch.org/g;x?y#s", "anchor3")
         },
         {
           new Outlink("http://www.nutch.org/g;something", "anchor1"),
           new Outlink("http://www.nutch.org/g;something?y#s", "anchor2"),
           new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
           new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
           new Outlink("http://www.nutch.org/?y=1;somethingelse", "anchor5")
         }
      };
  
    } catch (MalformedURLException e) {
       
View Full Code Here

Examples of org.apache.nutch.parse.Outlink

      fromUrlCriginalColectionName=SqlSearcher.getCollectionNameOriginal(collection);     
      fromUrlTimestamp=SqlSearcher.getTimestampOriginal(collection);     
    }
   
    for (int i = 0; i < outlinks.length; i++) {
      Outlink outlink = outlinks[i];
      String toUrl = outlink.getToUrl();
          
      if (this.nwIgnoreInternalLinks)
      {
        String toHost = getHost(toUrl);
       
        if (toHost == null || toHost.equals(fromHost)) { // internal link             
          continue;                               // skip it
        }
      }

      if (this.nwUrlNormalizers != null)
      {
        try {         
          toUrl = this.nwUrlNormalizers. normalize(toUrl, URLNormalizers.SCOPE_LINKDB);
        }
        catch (Exception e) {
          LOG.warn("Skipping " + toUrl + ":" + e);
          toUrl = null;
        }
      }
     
      if (toUrl != null && this.nwUrlFilters != null) {
        try {
          toUrl = this.nwUrlFilters.filter(toUrl); // filter the url
          if (toUrl==null) { 
            LOG.info("LINKDB URL FILTERED");
          }
        }
        catch (Exception e) {
          LOG.warn("Skipping " + toUrl + ":" + e);
          toUrl = null;
        }
      }
   
      if (toUrl == null) {
        continue;
      }

      inlinks.clear();
   
      String anchor = outlink.getAnchor();        // truncate long anchors
      
      if (anchor.length() > this.nwMaxAnchorLength) {
        anchor = anchor.substring(0, this.nwMaxAnchorLength);
      }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.