Examples of HtmlParser


Examples of appl.Portal.Utils.LinkSearch.HtmlParser

        this.setSearchTerm(someSearchWords[0]);

        if(this.buildSearchUrl(someSearchWords)) {
            // create a new HtmlParser instance configured with
            // the search-engine-specific parameters
            mHtmlParser = new HtmlParser();
            mHtmlParser.setRegExpFrame(mRegExpFrame);
            mHtmlParser.setRegExpItemSet(mRegExpItemSet);
            mHtmlParser.setRegExpItem(mRegExpItem);
            mHtmlParser.setNames(mNames);
View Full Code Here

Examples of br.com.caelum.tubaina.parser.html.HtmlParser

        LOG.warn(e.getMessage());
      }
    }

    if (html) {
      HtmlParser htmlParser = new HtmlParser(conf.read("/regex.properties", "/html.properties"), noAnswer);
      HtmlGenerator generator = new HtmlGenerator(htmlParser, strictXhtml, templateDir);
      File file = new File(outputDir, "html");
      FileUtils.forceMkdir(file);
      try {
        generator.generate(b, file);
View Full Code Here

Examples of br.com.caelum.tubaina.parser.html.desktop.HtmlParser

    public void setUp() throws IOException {
        Configuration cfg = new Configuration();
        cfg.setDirectoryForTemplateLoading(new File(TubainaBuilder.DEFAULT_TEMPLATE_DIR, "kindle"));
        cfg.setObjectWrapper(new BeansWrapper());

        Parser parser = new HtmlParser(new RegexConfigurator().read("/regex.properties",
                "/kindle.properties"));

        partToKindle = new PartToKindle(parser, cfg, new ArrayList<String>());
    }
View Full Code Here

Examples of cn.edu.hfut.dmic.webcollector.parser.HtmlParser

    public Parser createParser(String url, String contentType) throws Exception {
        if (contentType == null) {
            return null;
        }
        if (contentType.contains("text/html")) {
            return new HtmlParser(Config.topN);
        }
        return null;
    }
View Full Code Here

Examples of com.flaptor.util.parser.HtmlParser

        switch (docType) {
            case HTML:
                Config conf = Config.getConfig("crawler.properties");
                String removedXPathElements = conf.getString("HtmlParser.removedXPath");
                String[] separatorTags = conf.getStringArray("HtmlParser.separatorTags");
                parser = new HtmlParser(removedXPathElements,separatorTags);
                break;
            case PDF:
                parser = new PdfParser();
                break;
        }
View Full Code Here

Examples of com.google.dart.engine.html.parser.HtmlParser

      AbstractScanner scanner = new StringScanner(source, content);
      scanner.setPassThroughElements(new String[] {TAG_SCRIPT});
      Token token = scanner.tokenize();
      lineInfo = new LineInfo(scanner.getLineStarts());
      final RecordingErrorListener errorListener = new RecordingErrorListener();
      unit = new HtmlParser(source, errorListener).parse(token, lineInfo);
      unit.accept(new RecursiveXmlVisitor<Void>() {
        @Override
        public Void visitHtmlScriptTagNode(HtmlScriptTagNode node) {
          resolveScriptDirectives(node.getScript(), errorListener);
          return null;
View Full Code Here

Examples of com.google.gwt.thirdparty.streamhtmlparser.HtmlParser

   *
   * @param html the HTML to check.
   * @return true if the provided HTML string is complete.
   */
  public static boolean isCompleteHtml(String html) {
    HtmlParser htmlParser = HtmlParserFactory.createParser();
    try {
      htmlParser.parse(html);
    } catch (ParseException e) {
      return false;
    }
    return htmlParser.getState() == HtmlParser.STATE_TEXT
        && !htmlParser.inJavascript() && !htmlParser.inCss();
  }
View Full Code Here

Examples of com.salas.bb.utils.htmlparser.HtmlParser

    static String process(String aText, int sizeLimit, boolean html)
    {
        if (aText == null) return null;

        IHtmlParserListener listener;
        HtmlParser parser = new HtmlParser(true);

        StringBuilderListener bufListener = new StringBuilderListener(aText.length(), sizeLimit);
        listener = html ? new SwingHtmlFilter(bufListener) : new SwingPlainFilter(bufListener);

        try
        {
            parser.parse(new StringReader(aText), listener);
        } catch (IOException e)
        {
            // OK. Buffer will be empty.
        }
View Full Code Here

Examples of com.scraper.parser.HTMLParser

  }

  @Test
  public void testParser()
  {
    HTMLParser parser = HTMLParser.parseImages("not a url");
    assertTrue(parser.hasError());
    parser = HTMLParser.parseImages("www.yahoocom");
    assertTrue(parser.hasError());
    parser = HTMLParser.parseImages("www.yahoo.com");
    assertTrue(!parser.hasError());
    parser = HTMLParser.parseImages("http://www.yahoo.com");
    assertTrue(!parser.hasError());
    parser = HTMLParser.parseImages("https://www.yahoo.com/");
    assertTrue(!parser.hasError());
    assertTrue(parser.hasNextImage());
  }
View Full Code Here

Examples of com.substanceofcode.utils.HTMLParser

    throws IOException, CauseMemoryException, CauseException, Exception {
        /** Initialize item collection */
        Vector rssFeeds = new Vector();
       
        /** Initialize XML parser and parse OPML XML */
        HTMLParser parser = new HTMLParser(encodingUtil);
        try {
           
      // The first element is the main tag.
            int elementType = parser.parse();
      // If we found the prologue, get the next entry.
      if( elementType == XmlParser.PROLOGUE ) {
        elementType = parser.parse();
      }
      if (elementType == XmlParser.END_DOCUMENT ) {
        return null;
      }
           
      boolean bodyFound = false;
            do {
        if (elementType == HTMLParser.REDIRECT_URL) {
          RssItunesFeed [] feeds = new RssItunesFeed[1];
          feeds[0] = new RssItunesFeed("", parser.getRedirectUrl(),
              "", "");
          return feeds;
        }
        /** RSS item properties */
        String title = "";
        String link = "";
                       
        String tagName = parser.getName();
        //#ifdef DLOGGING
        if (finerLoggable) {logger.finer("tagname: " + tagName);}
        //#endif
        if (tagName.length() == 0) {
          continue;
        }
        switch (tagName.charAt(0)) {
          case 'm':
          case 'M':
            if (bodyFound) {
              break;
            }
            break;
          case 'b':
          case 'B':
            if (!bodyFound) {
              bodyFound = parser.isBodyFound();
            }
            break;
          case 'a':
          case 'A':
            //#ifdef DLOGGING
            if (finerLoggable) {logger.finer("Parsing <a> tag");}
            //#endif
           
            title = parser.getText();
            // Title can be 0 as this is used also for
            // getting
            title = title.trim();
            title = StringUtil.removeHtml( title );

            if (((link = parser.getAttributeValue( "href" ))
                  == null) || ( link.length() == 0 )) {
              continue;
            }
            link = link.trim();
            if ( link.length() == 0 ) {
              continue;
            }
            if (link.indexOf("://") >= 0) {
              if (!link.startsWith("http:") &&
                !link.startsWith("https:") &&
                !link.startsWith("file:") &&
                 !link.startsWith("jar:")) {
                //#ifdef DLOGGING
                if (finerLoggable) {logger.finer("Not support for protocol or no protocol=" + link);}
                //#endif
                continue;
              }
            } else {
              if (link.charAt(0) == '/') {
                int purl = url.indexOf("://");
                if ((purl + 4) >= url.length()) {
                  //#ifdef DLOGGING
                  if (finerLoggable) {logger.finer("Url too short=" + url + "," + purl);}
                  //#endif
                  continue;
                }
                int pslash = url.indexOf("/", purl + 3);
                String burl = url;
                if (pslash >= 0) {
                  burl = url.substring(0, pslash);
                }
                link = burl + link;
              } else {
                link = url + "/" + link;
              }
            }
           
            /** Debugging information */
            //#ifdef DLOGGING
            if (finerLoggable) {logger.finer("Title:       " + title);}
            if (finerLoggable) {logger.finer("Link:        " + link);}
            //#endif
            if (( feedURLFilter != null) &&
              ( link.toLowerCase().indexOf(feedURLFilter) < 0)) {
              continue;
            }
           
            if (( feedNameFilter != null) &&
              ((title != null) &&
              (title.toLowerCase().indexOf(feedNameFilter) < 0))) {
              continue;
            }
            RssItunesFeed feed = new RssItunesFeed(title, link, "", "");
            rssFeeds.addElement( feed );
            break;
          default:
        }
            }
            while( (elementType = parser.parse()) != XmlParser.END_DOCUMENT );
           
        } catch (CauseMemoryException ex) {
      CauseMemoryException cex = new CauseMemoryException(
          "Out of memory error while parsing HTML Link feed " + url,
          ex);
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.