Examples of HTMLParser


Examples of org.apache.lenya.lucene.html.HTMLParser

     * @return the content of the file.
     * @throws FileNotFoundException if the file does not exist.
     * @throws IOException if something else went wrong.
     */
    protected String readHtmlFile(File file) throws FileNotFoundException, IOException {
        java.io.Reader reader = new HTMLParser(file).getReader();
        char[] chars = new char[1024];
        int chars_read;
        java.io.Writer writer = new java.io.StringWriter();

        while ((chars_read = reader.read(chars)) > 0) {
View Full Code Here

Examples of org.apache.lenya.lucene.html.HTMLParser

        // This field is not stored with document, it is indexed, but it is not
        // tokenized prior to indexing.
        doc.add(new Field("uid", uid(f, htdocsDumpDir), false, true, false));

        //HtmlDocument htmlDoc = new HtmlDocument(f);
        HTMLParser parser = new HTMLParser(f);

        // Add the summary as an UnIndexed field, so that it is stored and returned
        // with hit documents for display.
        // Add the title as a separate Text field, so that it can be searched separately.
        /*
                String title = htmlDoc.getTitle();

                if (title != null) {
                    doc.add(Field.Text("title", title));
                } else {
                    doc.add(Field.Text("title", ""));
                }
        */
        doc.add(Field.Text("title", parser.getTitle()));

        //System.out.println("HTMLDocument.getLuceneDocument(): title field added: " + title);
        // Add the tag-stripped contents as a Reader-valued Text field so it will get tokenized and indexed.
        /*
                String body = htmlDoc.getBody();
                String contents = "";

                if ((body != null) && (title != null)) {
                    contents = title + " " + body;
                    doc.add(Field.Text("contents", title + body));
                }

                doc.add(Field.Text("contents", contents));
        */
        doc.add(Field.Text("contents", parser.getReader()));

        return doc;
    }
View Full Code Here

Examples of org.apache.lenya.lucene.parser.HTMLParser

     * @return DOCUMENT ME!
     *
     * @throws Exception DOCUMENT ME!
     */
    public static String getBodyText(File file) throws Exception {
        HTMLParser parser = HTMLParserFactory.newInstance(file);
        parser.parse(file);

        Reader reader = parser.getReader();
        Writer writer = new StringWriter();

        int c;

        while ((c = reader.read()) != -1)
View Full Code Here

Examples of org.apache.lenya.lucene.parser.HTMLParser

     * @throws Exception DOCUMENT ME!
     */
    public Document getDocument(File file, File htdocsDumpDir) throws Exception {
        Document document = super.getDocument(file, htdocsDumpDir);

        HTMLParser parser = HTMLParserFactory.newInstance(file);
        parser.parse(file);

        document.add(Field.Text("title", parser.getTitle()));
        document.add(Field.Text("keywords", parser.getKeywords()));
        document.add(Field.Text("contents", parser.getReader()));

        return document;
    }
View Full Code Here

Examples of org.apache.lucene.demo.html.HTMLParser

    // Add the uid as a field, so that index can be incrementally maintained.
    // This field is not stored with document, it is indexed, but it is not
    // tokenized prior to indexing.
    doc.add(new Field("uid", uid(f), false, true, false));

    HTMLParser parser = new HTMLParser(f);

    // Add the tag-stripped contents as a Reader-valued Text field so it will
    // get tokenized and indexed.
    doc.add(Field.Text("contents", parser.getReader()));

    // Add the summary as an UnIndexed field, so that it is stored and returned
    // with hit documents for display.
    doc.add(Field.UnIndexed("summary", parser.getSummary()));

    // Add the title as a separate Text field, so that it can be searched
    // separately.
    doc.add(Field.Text("title", parser.getTitle()));

    // return the document
    return doc;
  }
View Full Code Here

Examples of org.apache.lucene.demo.html.HTMLParser

    // 5. skip until end of doc header
    read("</DOCHDR>",null,false,false);
    // 6. collect until end of doc
    sb = read("</DOC>",null,false,true);
    // this is the next document, so parse it 
    HTMLParser p = new HTMLParser(new StringReader(sb.toString()));
    // title
    String title = p.getTitle();
    // properties
    Properties props = p.getMetaTags();
    // body
    Reader r = p.getReader();
    char c[] = new char[1024];
    StringBuffer bodyBuf = new StringBuffer();
    int n;
    while ((n = r.read(c)) >= 0) {
      if (n>0) {
View Full Code Here

Examples of org.apache.lucene.demo.html.HTMLParser

    // This field is not stored with document, it is indexed, but it is not
    // tokenized prior to indexing.
    doc.add(new Field("uid", uid(f), Field.Store.NO, Field.Index.UN_TOKENIZED));

    FileInputStream fis = new FileInputStream(f);
    HTMLParser parser = new HTMLParser(fis);
     
    // Add the tag-stripped contents as a Reader-valued Text field so it will
    // get tokenized and indexed.
    doc.add(new Field("contents", parser.getReader()));

    // Add the summary as a field that is stored and returned with
    // hit documents for display.
    doc.add(new Field("summary", parser.getSummary(), Field.Store.YES, Field.Index.NO));

    // Add the title as a field that can be searched and that is stored.
    doc.add(new Field("title", parser.getTitle(), Field.Store.YES, Field.Index.TOKENIZED));

    // return the document
    return doc;
  }
View Full Code Here

Examples of org.apache.nutch.parse.html.HtmlParser

  private Configuration conf;
  private Parser parser;

  public TestHtmlParser() {
    conf = NutchConfiguration.create();
    parser = new HtmlParser();
    parser.setConf(conf);
  }
View Full Code Here

Examples of org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlParser

        }
        catch (InitializationException e) {
            LOG.error("Registry Initialization Error: " + e.getMessage());
            throw new IOException(e.getMessage());
        }
        parser = new HtmlParser();

    }
View Full Code Here

Examples of org.apache.tika.parser.html.HtmlParser

    super(UrlDatum.FIELDS);
  }

  private synchronized void init() {
    if (_parser == null) {
      _parser = new HtmlParser();
    }
   
    if (_handler == null) {
      _handler = new DefaultHandler() {
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.