// Source code of org.apache.nutch.parse.html.HtmlParser

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package org.apache.nutch.parse.html;


import java.util.ArrayList;
import java.net.URL;
import java.net.MalformedURLException;
import java.io.*;
import java.util.regex.*;


import org.cyberneko.html.parsers.*;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.w3c.dom.*;
import org.apache.html.dom.*;


import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;


import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.Content;
import org.apache.hadoop.conf.*;
import org.apache.nutch.parse.*;
import org.apache.nutch.util.*;


public class HtmlParser implements Parser {
  public static final Log LOG = LogFactory.getLog("org.apache.nutch.parse.html");


  // I used 1000 bytes at first, but  found that some documents have 
  // meta tag well past the first 1000 bytes. 
  // (e.g. http://cn.promo.yahoo.com/customcare/music.html)
  private static final int CHUNK_SIZE = 2000;
  private static Pattern metaPattern =
    Pattern.compile("<meta\\s+([^>]*http-equiv=\"?content-type\"?[^>]*)>",
                    Pattern.CASE_INSENSITIVE);
  private static Pattern charsetPattern =
    Pattern.compile("charset=\\s*([a-z][_\\-0-9a-z]*)",
                    Pattern.CASE_INSENSITIVE);
  
  private String parserImpl;


  /**
   * Given a <code>byte[]</code> representing an html file of an 
   * <em>unknown</em> encoding,  read out 'charset' parameter in the meta tag   
   * from the first <code>CHUNK_SIZE</code> bytes.
   * If there's no meta tag for Content-Type or no charset is specified,
   * <code>null</code> is returned.  <br />
   * FIXME: non-byte oriented character encodings (UTF-16, UTF-32)
   * can't be handled with this. 
   * We need to do something similar to what's done by mozilla
   * (http://lxr.mozilla.org/seamonkey/source/parser/htmlparser/src/nsParser.cpp#1993).
   * See also http://www.w3.org/TR/REC-xml/#sec-guessing
   * <br />
   *
   * @param content <code>byte[]</code> representation of an html file
   */


  private static String sniffCharacterEncoding(byte[] content) {
    int length = content.length < CHUNK_SIZE ? 
                 content.length : CHUNK_SIZE;


    // We don't care about non-ASCII parts so that it's sufficient
    // to just inflate each byte to a 16-bit value by padding. 
    // For instance, the sequence {0x41, 0x82, 0xb7} will be turned into 
    // {U+0041, U+0082, U+00B7}. 
    String str = new String(content, 0, 0, length); 


    Matcher metaMatcher = metaPattern.matcher(str);
    String encoding = null;
    if (metaMatcher.find()) {
      Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1));
      if (charsetMatcher.find()) 
        encoding = new String(charsetMatcher.group(1));
    }


    return encoding;
  }




  // Fallback encoding used by getParse() when neither the Content-Type
  // header nor the document's meta tag yields one. Read in setConf() from
  // "parser.character.encoding.default" (default "windows-1252").
  private String defaultCharEncoding;


  // Configuration handed to setConf() and returned by getConf().
  private Configuration conf;
  
  // DOM helper created in setConf(); getParse() uses it to extract the text,
  // the title, the <base> URL and the outlinks from the parsed fragment.
  private DOMContentUtils utils;


  // Parse filters created in setConf() and applied to every parse at the
  // end of getParse().
  private HtmlParseFilters htmlParseFilters;
  
  // Value stored under Nutch.CACHING_FORBIDDEN_KEY when the page's meta tags
  // request no caching. Read in setConf() from
  // "parser.caching.forbidden.policy".
  private String cachingPolicy;
  
  public Parse getParse(Content content) {
    HTMLMetaTags metaTags = new HTMLMetaTags();


    URL base;
    try {
      base = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
      return new ParseStatus(e).getEmptyParse(getConf());
    }


    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];
    Metadata metadata = new Metadata();


    // parse the content
    DocumentFragment root;
    try {
      byte[] contentInOctets = content.getContent();
      InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets));
      String contentType = content.getMetadata().get(Response.CONTENT_TYPE);
      String encoding = StringUtil.parseCharacterEncoding(contentType);
      if ((encoding != null) && !("".equals(encoding))) {
        metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding);
        if ((encoding = StringUtil.resolveEncodingAlias(encoding)) != null) {
          metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding);
          if (LOG.isTraceEnabled()) {
            LOG.trace(base + ": setting encoding to " + encoding);
          }
        }
      }


      // sniff out 'charset' value from the beginning of a document
      if ((encoding == null) || ("".equals(encoding))) {
        encoding = sniffCharacterEncoding(contentInOctets);
        if (encoding!=null) {
          metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding);
          if ((encoding = StringUtil.resolveEncodingAlias(encoding)) != null) {
            metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding);
            if (LOG.isTraceEnabled()) {
              LOG.trace(base + ": setting encoding to " + encoding);
            }
          }
        }
      }


      if (encoding == null) {
        // fallback encoding.
        // FIXME : In addition to the global fallback value,
        // we should make it possible to specify fallback encodings for each ccTLD.
        // (e.g. se: windows-1252, kr: x-windows-949, cn: gb18030, tw: big5
        // doesn't work for jp because euc-jp and shift_jis have about the
        // same share)
        encoding = defaultCharEncoding;
        metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, defaultCharEncoding);
        if (LOG.isTraceEnabled()) {
          LOG.trace(base + ": falling back to " + defaultCharEncoding);
        }
      }
      input.setEncoding(encoding);
      if (LOG.isTraceEnabled()) { LOG.trace("Parsing..."); }
      root = parse(input);
    } catch (IOException e) {
      return new ParseStatus(e).getEmptyParse(getConf());
    } catch (DOMException e) {
      return new ParseStatus(e).getEmptyParse(getConf());
    } catch (SAXException e) {
      return new ParseStatus(e).getEmptyParse(getConf());
    } catch (Exception e) {
      e.printStackTrace(LogUtil.getWarnStream(LOG));
      return new ParseStatus(e).getEmptyParse(getConf());
    }
      
    // get meta directives
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
    if (LOG.isTraceEnabled()) {
      LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
    }
    // check meta directives
    if (!metaTags.getNoIndex()) {               // okay to index
      StringBuffer sb = new StringBuffer();
      if (LOG.isTraceEnabled()) { LOG.trace("Getting text..."); }
      utils.getText(sb, root);          // extract text
      text = sb.toString();
      sb.setLength(0);
      if (LOG.isTraceEnabled()) { LOG.trace("Getting title..."); }
      utils.getTitle(sb, root);         // extract title
      title = sb.toString().trim();
    }
      
    if (!metaTags.getNoFollow()) {              // okay to follow links
      ArrayList l = new ArrayList();              // extract outlinks
      URL baseTag = utils.getBase(root);
      if (LOG.isTraceEnabled()) { LOG.trace("Getting links..."); }
      utils.getOutlinks(baseTag!=null?baseTag:base, l, root);
      outlinks = (Outlink[])l.toArray(new Outlink[l.size()]);
      if (LOG.isTraceEnabled()) {
        LOG.trace("found "+outlinks.length+" outlinks in "+content.getUrl());
      }
    }
    
    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    if (metaTags.getRefresh()) {
      status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
      status.setMessage(metaTags.getRefreshHref().toString());
    }
    ParseData parseData = new ParseData(status, title, outlinks,
                                        content.getMetadata(), metadata);
    parseData.setConf(this.conf);
    Parse parse = new ParseImpl(text, parseData);


    // run filters on parse
    parse = this.htmlParseFilters.filter(content, parse, metaTags, root);
    if (metaTags.getNoCache()) {             // not okay to cache
      parse.getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
    }
    return parse;
  }


  private DocumentFragment parse(InputSource input) throws Exception {
    if (parserImpl.equalsIgnoreCase("tagsoup"))
      return parseTagSoup(input);
    else return parseNeko(input);
  }
  
  private DocumentFragment parseTagSoup(InputSource input) throws Exception {
    HTMLDocumentImpl doc = new HTMLDocumentImpl();
    DocumentFragment frag = doc.createDocumentFragment();
    DOMBuilder builder = new DOMBuilder(doc, frag);
    org.ccil.cowan.tagsoup.Parser reader = new org.ccil.cowan.tagsoup.Parser();
    reader.setContentHandler(builder);
    reader.setFeature(reader.ignoreBogonsFeature, true);
    reader.setFeature(reader.bogonsEmptyFeature, false);
    reader.setProperty("http://xml.org/sax/properties/lexical-handler", builder);
    reader.parse(input);
    return frag;
  }
  
  private DocumentFragment parseNeko(InputSource input) throws Exception {
    DOMFragmentParser parser = new DOMFragmentParser();
    // some plugins, e.g., creativecommons, need to examine html comments
    try {
      parser.setFeature("http://apache.org/xml/features/include-comments", 
              true);
      parser.setFeature("http://apache.org/xml/features/augmentations", 
              true);
      parser.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content",
              false);
      parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment",
              true);
      parser.setFeature("http://cyberneko.org/html/features/report-errors",
              true);
    } catch (SAXException e) {}
    // convert Document to DocumentFragment
    HTMLDocumentImpl doc = new HTMLDocumentImpl();
    doc.setErrorChecking(false);
    DocumentFragment res = doc.createDocumentFragment();
    DocumentFragment frag = doc.createDocumentFragment();
    parser.parse(input, frag);
    res.appendChild(frag);
    
    try {
      while(true) {
        frag = doc.createDocumentFragment();
        parser.parse(input, frag);
        if (!frag.hasChildNodes()) break;
        if (LOG.isInfoEnabled()) {
          LOG.info(" - new frag, " + frag.getChildNodes().getLength() + " nodes.");
        }
        res.appendChild(frag);
      }
    } catch (Exception x) { x.printStackTrace(LogUtil.getWarnStream(LOG));};
    return res;
  }
  
  public static void main(String[] args) throws Exception {
    //LOG.setLevel(Level.FINE);
    String name = args[0];
    String url = "file:"+name;
    File file = new File(name);
    byte[] bytes = new byte[(int)file.length()];
    DataInputStream in = new DataInputStream(new FileInputStream(file));
    in.readFully(bytes);
    Configuration conf = NutchConfiguration.create();
    HtmlParser parser = new HtmlParser();
    parser.setConf(conf);
    Parse parse = parser.getParse(
            new Content(url, url, bytes, "text/html", new Metadata(), conf));
    System.out.println("data: "+parse.getData());


    System.out.println("text: "+parse.getText());
    
  }


  public void setConf(Configuration conf) {
    this.conf = conf;
    this.htmlParseFilters = new HtmlParseFilters(getConf());
    this.parserImpl = getConf().get("parser.html.impl", "neko");
    this.defaultCharEncoding = getConf().get(
        "parser.character.encoding.default", "windows-1252");
    this.utils = new DOMContentUtils(conf);
    this.cachingPolicy = getConf().get("parser.caching.forbidden.policy",
        Nutch.CACHING_FORBIDDEN_CONTENT);
  }


  public Configuration getConf() {
    return this.conf;
  }
}
// Source code of org.apache.nutch.parse.html.HtmlParser

// Related classes of org.apache.nutch.parse.html.HtmlParser