Package org.apache.nutch.parse

Examples of org.apache.nutch.parse.Outlink


            if (target != null && !noFollow && !post)
              try {
               
                URL url = (base.toString().indexOf(';') > 0) ?
                  fixEmbeddedParams(base, target) : new URL(base, target);
                outlinks.add(new Outlink(url.toString(),
                                         linkText.toString().trim()));
              } catch (MalformedURLException e) {
                // don't care
              }
          }
View Full Code Here


      testDOMs[i]= node;
    }
    try {
     answerOutlinks = new Outlink[][]{
         {
           new Outlink("http://www.nutch.org", "anchor"),
         },
         {
           new Outlink("http://www.nutch.org/", "home"),
           new Outlink("http://www.nutch.org/docs/bot.html", "bots"),
         },
         {
           new Outlink("http://www.nutch.org/", "separate this"),
           new Outlink("http://www.nutch.org/docs/ok", "from this"),
         },
         {
           new Outlink("http://www.nutch.org/", "home"),
           new Outlink("http://www.nutch.org/docs/1", "1"),
           new Outlink("http://www.nutch.org/docs/2", "2"),
         },
         {
           new Outlink("http://www.nutch.org/frames/top.html", ""),
           new Outlink("http://www.nutch.org/frames/left.html", ""),
           new Outlink("http://www.nutch.org/frames/invalid.html", ""),
           new Outlink("http://www.nutch.org/frames/right.html", ""),
         },
         {
           new Outlink("http://www.nutch.org/maps/logo.gif", ""),
           new Outlink("http://www.nutch.org/index.html", ""),
           new Outlink("http://www.nutch.org/maps/#bottom", ""),
           new Outlink("http://www.nutch.org/bot.html", ""),
           new Outlink("http://www.nutch.org/docs/index.html", ""),
         },
         {
             new Outlink("http://www.nutch.org/index.html", "whitespace test"),
         },
         {
         },
         {
           new Outlink("http://www.nutch.org/dummy.jsp", "test2"),
         },
         {
         },
         {
           new Outlink("http://www.nutch.org/;x", "anchor1"),
           new Outlink("http://www.nutch.org/g;x", "anchor2"),
           new Outlink("http://www.nutch.org/g;x?y#s", "anchor3")
         },
         {
           new Outlink("http://www.nutch.org/g;something", "anchor1"),
           new Outlink("http://www.nutch.org/g;something?y#s", "anchor2"),
           new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
           new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
           new Outlink("http://www.nutch.org/?y=1;somethingelse", "anchor5")
         }
      };
  
    } catch (MalformedURLException e) {
       
View Full Code Here

      fromUrlCriginalColectionName=SqlSearcher.getCollectionNameOriginal(collection);     
      fromUrlTimestamp=SqlSearcher.getTimestampOriginal(collection);     
    }
   
    for (int i = 0; i < outlinks.length; i++) {
      Outlink outlink = outlinks[i];
      String toUrl = outlink.getToUrl();
          
      if (this.nwIgnoreInternalLinks)
      {
        String toHost = getHost(toUrl);
       
        if (toHost == null || toHost.equals(fromHost)) { // internal link             
          continue;                               // skip it
        }
      }

      if (this.nwUrlNormalizers != null)
      {
        try {         
          toUrl = this.nwUrlNormalizers. normalize(toUrl, URLNormalizers.SCOPE_LINKDB);
        }
        catch (Exception e) {
          LOG.warn("Skipping " + toUrl + ":" + e);
          toUrl = null;
        }
      }
     
      if (toUrl != null && this.nwUrlFilters != null) {
        try {
          toUrl = this.nwUrlFilters.filter(toUrl); // filter the url
          if (toUrl==null) { 
            LOG.info("LINKDB URL FILTERED");
          }
        }
        catch (Exception e) {
          LOG.warn("Skipping " + toUrl + ":" + e);
          toUrl = null;
        }
      }
   
      if (toUrl == null) {
        continue;
      }

      inlinks.clear();
   
      String anchor = outlink.getAnchor();        // truncate long anchors
      
      if (anchor.length() > this.nwMaxAnchorLength) {
        anchor = anchor.substring(0, this.nwMaxAnchorLength);
      }
View Full Code Here

            }
            if (target != null && !noFollow && !post)
              try {
               
                URL url = URLUtil.resolveURL(base, target);
                outlinks.add(new Outlink(url.toString(),
                                         linkText.toString().trim()));
              } catch (MalformedURLException e) {
                // don't care
              }
          }
View Full Code Here

      testDOMs[i]= node;
    }
    try {
      answerOutlinks = new Outlink[][]{
          {
            new Outlink("http://www.nutch.org", "anchor"),
          },
          {
            new Outlink("http://www.nutch.org/", "home"),
            new Outlink("http://www.nutch.org/docs/bot.html", "bots"),
          },
          {
            new Outlink("http://www.nutch.org/", "separate this"),
            new Outlink("http://www.nutch.org/docs/ok", "from this"),
          },
          {
            new Outlink("http://www.nutch.org/", "home"),
            new Outlink("http://www.nutch.org/docs/1", "1"),
            new Outlink("http://www.nutch.org/docs/2", "2"),
          },
          {
            new Outlink("http://www.nutch.org/frames/top.html", ""),
            new Outlink("http://www.nutch.org/frames/left.html", ""),
            new Outlink("http://www.nutch.org/frames/invalid.html", ""),
            new Outlink("http://www.nutch.org/frames/right.html", ""),
          },
          {
            new Outlink("http://www.nutch.org/maps/logo.gif", ""),
            new Outlink("http://www.nutch.org/index.html", ""),
            new Outlink("http://www.nutch.org/maps/#bottom", ""),
            new Outlink("http://www.nutch.org/bot.html", ""),
            new Outlink("http://www.nutch.org/docs/index.html", ""),
          },
          {
            new Outlink("http://www.nutch.org/index.html", "whitespace test"),
          },
          {
          },
          {
            new Outlink("http://www.nutch.org/dummy.jsp", "test2"),
          },
          {
          },
          {
            new Outlink("http://www.nutch.org/;x", "anchor1"),
            new Outlink("http://www.nutch.org/g;x", "anchor2"),
            new Outlink("http://www.nutch.org/g;x?y#s", "anchor3")
          },
          {
            // this is tricky - see RFC3986 section 5.4.1 example 7
            new Outlink("http://www.nutch.org/g", "anchor1"),
            new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
            new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
            new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
            new Outlink("http://www.nutch.org/;something?y=1;somethingelse", "anchor5")
          },
          {
            new Outlink("http://www.nutch.org/g", ""),
            new Outlink("http://www.nutch.org/g1", ""),
            new Outlink("http://www.nutch.org/g2", "bla bla"),
            new Outlink("http://www.nutch.org/test.gif", "bla bla"),
          }
      };

    } catch (MalformedURLException e) {
View Full Code Here

            Parse parse = new ParseUtil(this.conf).parse(content).get(content.getUrl());
            ParseData theParseData = parse.getData();
            Outlink[] theOutlinks = theParseData.getOutlinks();
           
            for(int count = 0; count < theOutlinks.length; count++) {
              outLinksList.add(new Outlink(theOutlinks[count].getToUrl(), theOutlinks[count].getAnchor()));
            }
           
            resultText += entry.getName() + " " + parse.getText() + " ";
          } catch (ParseException e) {
            if (LOG.isInfoEnabled()) {
View Full Code Here

    Assert.assertNotNull(filter);

    NutchDocument doc = new NutchDocument();

    String title = "The Foo Page";
    Outlink[] outlinks = new Outlink[] { new Outlink("http://foo.com/", "Foo") };
    Metadata metaData = new Metadata();
    metaData.add("Language", "en/us");
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData);
    ParseImpl parse = new ParseImpl("this is a sample foo bar page. hope you enjoy it.", parseData);
View Full Code Here

        Map<String, String> outlinkMap = new LinkedHashMap<String, String>();

        // normalize urls and put into map
        if (outlinkAr != null && outlinkAr.length > 0) {
          for (int i = 0; i < outlinkAr.length; i++) {
            Outlink outlink = outlinkAr[i];
            String toUrl = normalizeUrl(outlink.getToUrl());

            if (filterUrl(toUrl) == null) {
              continue;
            }

            // only put into map if the url doesn't already exist in the map or
            // if it does and the anchor for that link is null, will replace if
            // url is existing
            boolean existingUrl = outlinkMap.containsKey(toUrl);
            if (toUrl != null
              && (!existingUrl || (existingUrl && outlinkMap.get(toUrl) == null))) {
              outlinkMap.put(toUrl, outlink.getAnchor());
            }
          }
        }

        // collect the outlinks under the fetch time
View Full Code Here

        Assert.assertTrue("caught exception: " + e, false);
      }
      testDOMs[i] = node;
    }
    answerOutlinks = new Outlink[][] {
        { new Outlink("http://www.nutch.org", "anchor"), },
        { new Outlink("http://www.nutch.org/", "home"),
          new Outlink("http://www.nutch.org/docs/bot.html", "bots"), },
          { new Outlink("http://www.nutch.org/", "separate this"),
            new Outlink("http://www.nutch.org/docs/ok", "from this"), },
            { new Outlink("http://www.nutch.org/", "home"),
              new Outlink("http://www.nutch.org/docs/1", "1"),
              new Outlink("http://www.nutch.org/docs/2", "2"), },
              { new Outlink("http://www.nutch.org/frames/top.html", ""),
                new Outlink("http://www.nutch.org/frames/left.html", ""),
                new Outlink("http://www.nutch.org/frames/invalid.html", ""),
                new Outlink("http://www.nutch.org/frames/right.html", ""), },
                { new Outlink("http://www.nutch.org/maps/logo.gif", ""),
                  new Outlink("http://www.nutch.org/index.html", ""),
                  new Outlink("http://www.nutch.org/maps/#bottom", ""),
                  new Outlink("http://www.nutch.org/bot.html", ""),
                  new Outlink("http://www.nutch.org/docs/index.html", ""), },
                  { new Outlink("http://www.nutch.org/index.html", "whitespace test"), },
                  {},
                  { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), },
                  {},
                  { new Outlink("http://www.nutch.org/;x", "anchor1"),
                    new Outlink("http://www.nutch.org/g;x", "anchor2"),
                    new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") },
                    {
                      // this is tricky - see RFC3986 section 5.4.1 example 7
                      new Outlink("http://www.nutch.org/g", "anchor1"),
                      new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
                      new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
                      new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
                      new Outlink("http://www.nutch.org/;something?y=1;somethingelse",
                          "anchor5") } };

  }
View Full Code Here

        }
        url = url.replaceAll("&amp;", "&");
        if (LOG.isTraceEnabled()) {
          LOG.trace(" - outlink from JS: '" + url + "'");
        }
        outlinks.add(new Outlink(url, anchor));
      }
    } catch (Exception ex) {
      // if it is a malformed URL we just throw it away and continue with
      // extraction.
      if (LOG.isErrorEnabled()) { LOG.error("getJSLinks", ex); }
View Full Code Here

TOP

Related Classes of org.apache.nutch.parse.Outlink

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.