Package org.apache.nutch.tika

Source Code of org.apache.nutch.tika.TestRobotsMetaProcessor

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.nutch.tika;

import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.tika.HTMLMetaProcessor;

import java.io.ByteArrayInputStream;
import java.net.URL;

import org.xml.sax.*;
import org.w3c.dom.*;
import org.apache.html.dom.*;
import org.cyberneko.html.parsers.DOMFragmentParser;
import org.junit.Assert;
import org.junit.Test;

/** Unit tests for HTMLMetaProcessor. */
public class TestRobotsMetaProcessor {

  /*

  some sample tags:

  <meta name="robots" content="index,follow">
  <meta name="robots" content="noindex,follow">
  <meta name="robots" content="index,nofollow">
  <meta name="robots" content="noindex,nofollow">

  <META HTTP-EQUIV="Pragma" CONTENT="no-cache">

   */


  public static String[] tests=
    {
    "<html><head><title>test page</title>"
        + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> "
        + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> "
        + "</head><body>"
        + " some text"
        + "</body></html>",

        "<html><head><title>test page</title>"
            + "<meta name=\"robots\" content=\"all\"> "
            + "<meta http-equiv=\"pragma\" content=\"no-cache\"> "
            + "</head><body>"
            + " some text"
            + "</body></html>",

            "<html><head><title>test page</title>"
                + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> "
                + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> "
                + "</head><body>"
                + " some text"
                + "</body></html>",

                "<html><head><title>test page</title>"
                    + "<meta name=\"robots\" content=\"none\"> "
                    + "</head><body>"
                    + " some text"
                    + "</body></html>",

                    "<html><head><title>test page</title>"
                        + "<meta name=\"robots\" content=\"noindex,nofollow\"> "
                        + "</head><body>"
                        + " some text"
                        + "</body></html>",

                        "<html><head><title>test page</title>"
                            + "<meta name=\"robots\" content=\"noindex,follow\"> "
                            + "</head><body>"
                            + " some text"
                            + "</body></html>",

                            "<html><head><title>test page</title>"
                                + "<meta name=\"robots\" content=\"index,nofollow\"> "
                                + "</head><body>"
                                + " some text"
                                + "</body></html>",

                                "<html><head><title>test page</title>"
                                    + "<meta name=\"robots\" content=\"index,follow\"> "
                                    + "<base href=\"http://www.nutch.org/\">"
                                    + "</head><body>"
                                    + " some text"
                                    + "</body></html>",

                                    "<html><head><title>test page</title>"
                                        + "<meta name=\"robots\"> "
                                        + "<base href=\"http://www.nutch.org/base/\">"
                                        + "</head><body>"
                                        + " some text"
                                        + "</body></html>",

    };

  public static final boolean[][] answers= {
    {true, true, true},     // NONE
    {false, false, true},   // all
    {true, true, true},     // nOnE
    {true, true, false},    // none
    {true, true, false},    // noindex,nofollow
    {true, false, false},   // noindex,follow
    {false, true, false},   // index,nofollow
    {false, false, false}// index,follow
    {false, false, false}// missing!
  };

  private URL[][] currURLsAndAnswers;

  @Test
  public void testRobotsMetaProcessor() {
    DOMFragmentParser parser= new DOMFragmentParser();;

    try {
      currURLsAndAnswers= new URL[][] {
          {new URL("http://www.nutch.org"), null},
          {new URL("http://www.nutch.org"), null},
          {new URL("http://www.nutch.org"), null},
          {new URL("http://www.nutch.org"), null},
          {new URL("http://www.nutch.org"), null},
          {new URL("http://www.nutch.org"), null},
          {new URL("http://www.nutch.org"), null},
          {new URL("http://www.nutch.org/foo/"),
            new URL("http://www.nutch.org/")},
            {new URL("http://www.nutch.org"),
              new URL("http://www.nutch.org/base/")}
      };
    } catch (Exception e) {
      Assert.assertTrue("couldn't make test URLs!", false);
    }

    for (int i= 0; i < tests.length; i++) {
      byte[] bytes= tests[i].getBytes();

      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();

      try {
        parser.parse(new InputSource(new ByteArrayInputStream(bytes)), node);
      } catch (Exception e) {
        e.printStackTrace();
      }

      HTMLMetaTags robotsMeta= new HTMLMetaTags();
      HTMLMetaProcessor.getMetaTags(robotsMeta, node,
          currURLsAndAnswers[i][0]);

      Assert.assertTrue("got index wrong on test " + i,
          robotsMeta.getNoIndex() == answers[i][0]);
      Assert.assertTrue("got follow wrong on test " + i,
          robotsMeta.getNoFollow() == answers[i][1]);
      Assert.assertTrue("got cache wrong on test " + i,
          robotsMeta.getNoCache() == answers[i][2]);
      Assert.assertTrue("got base href wrong on test " + i + " (got "
          + robotsMeta.getBaseHref() + ")",
          ( (robotsMeta.getBaseHref() == null)
              && (currURLsAndAnswers[i][1] == null) )
              || ( (robotsMeta.getBaseHref() != null)
                  && robotsMeta.getBaseHref().equals(
                      currURLsAndAnswers[i][1]) ) );

    }
  }

}
TOP

Related Classes of org.apache.nutch.tika.TestRobotsMetaProcessor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.