Package org.htmlparser.parserapplications

Examples of org.htmlparser.parserapplications.StringExtractor


    doc.setAuthor(author); doc.setMetadata(metadata);
   } // eod of if

   //*-- Populate the contents of the document with the entire text from the web page
   logger.info("Extracting text from body of html file " + ifile);
   StringExtractor st = new StringExtractor(ifile);

   //*-- string extractor does not input form values -- handle separately
   parser.setInputHTML(htmlcontents); StringBuffer inputVal = new StringBuffer();
   NodeList nodelist3 = parser.parse(new TagNameFilter ("INPUT"));
   for (int i = 0; i < nodelist3.size(); i++)
   { InputTag itag = (InputTag) nodelist3.elementAt(i);
   if ((itag != null) && (itag.getAttribute("value") != null) )
   { inputVal.append(" "); inputVal.append( itag.getAttribute("value") ); }
   }

   //*-- finally set the contents of the document
   doc.setContents( new StringBuffer(cleanHTML( st.extractStrings(false)) + " " + inputVal) );
   doc.setFileName(ifile);

  } //*-- end of try block
  catch (OutOfMemoryError exc)
  { logger.error("Ran out of memory for " + ifile + " or could be corrupt file " + exc.getMessage()); }
View Full Code Here

TOP

Related Classes of org.htmlparser.parserapplications.StringExtractor

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.