Package org.jsoup.nodes

Examples of org.jsoup.nodes.Document.select()


        if (doc == null) {
          reporter.incrCounter(this._counterGroup, "Skipped - Unable to Parse HTML", 1);
          return;
        }

        Elements mf = doc.select("[itemtype~=schema.org]");

        if (mf.size() > 0) {
          for (Element e : mf) {
            if (e.hasAttr("itemtype")) {
              output.collect(new Text(e.attr("itemtype").toLowerCase().trim()), new LongWritable(1));
View Full Code Here


            // fsyprint("Fetching %s...", url);
            String last = "";
            final String out = Constants.DATA_PATH + "playlist/"
                + playListName + ".plist";
            final Document doc = Jsoup.connect(url).get();
            final Elements links = doc.select("a[href]");
            final File playListOut = new File(out);
            final FileOutputStream fos = new FileOutputStream(
                playListOut);
            final BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(fos));
View Full Code Here

    public JsonObject extractTweet(String html)
  throws java.net.MalformedURLException, java.io.UnsupportedEncodingException {
  JsonObject status = new JsonObject();

  Document doc = Jsoup.parse(html);
  Element tweet_div = doc.select("div.permalink-tweet").first();

  String tweet_text = tweet_div.select("p.tweet-text").first().text();
  status.addProperty("text", tweet_text);

  String tweet_id = tweet_div.attr("data-tweet-id");
View Full Code Here

  String tweet_id = tweet_div.attr("data-tweet-id");
  status.addProperty("id_str", tweet_id);
  status.addProperty("id", Long.parseLong(tweet_id));

  String timestamp = doc.select("span.js-short-timestamp").first().attr("data-time");
  Date created_at = new Date();
  created_at.setTime(Long.parseLong(timestamp) * 1000);
  status.addProperty("created_at", date_fmt.format(created_at));

  Elements js_stats_retweets = doc.select("li.js-stat-retweets");
View Full Code Here

  String timestamp = doc.select("span.js-short-timestamp").first().attr("data-time");
  Date created_at = new Date();
  created_at.setTime(Long.parseLong(timestamp) * 1000);
  status.addProperty("created_at", date_fmt.format(created_at));

  Elements js_stats_retweets = doc.select("li.js-stat-retweets");
  if (!js_stats_retweets.isEmpty()) {
      status.addProperty("retweeted", true);
      String count = js_stats_retweets.select("strong").first().text();
      status.addProperty("retweet_count", Long.parseLong(count));
  } else {
View Full Code Here

      status.addProperty("retweet_count", Long.parseLong(count));
  } else {
      status.addProperty("retweeted", false);
      status.addProperty("retweet_count", 0);
  }
  Elements js_stats_favs = doc.select("li.js-stat-favorites");
  status.addProperty("favorited", !js_stats_favs.isEmpty());
     

  // User subfield
  JsonObject user = new JsonObject();
View Full Code Here

  user.addProperty("name", user_name);
 
  status.add("user", user);
 
  // Geo information
  Elements tweet_loc = doc.select("a.tweet-geo-text");
  if (!tweet_loc.isEmpty()) {
      JsonObject location = new JsonObject();
      Element loc = tweet_loc.first();
      // Adding http to avoid malformed URL exception
      URL url = new URL("http:" + loc.attr("href"));
View Full Code Here

            // print("Fetching %s...", url);
            String last = "";
            final String out = Constants.DATA_PATH + "playlist/"
                + playListName + ".plist";
            final Document doc = Jsoup.connect(url).get();
            final Elements links = doc.select("iframe");

            final File playListOut = new File(out);
            final FileOutputStream fos = new FileOutputStream(
                playListOut);
            final BufferedWriter bw = new BufferedWriter(
View Full Code Here

            // print("Fetching %s...", url);
            String last = "";
            final String out = Constants.DATA_PATH + "playlist/"
                + playListName + ".plist";
            final Document doc = Jsoup.connect(url).get();
            final Elements links = doc.select("a[href]");
            final File playListOut = new File(out);
            final FileOutputStream fos = new FileOutputStream(
                playListOut);
            final BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(fos));
View Full Code Here

  }

  public void transformString(String channelString) {
    Document doc = Jsoup.parse(channelString);
    Elements tmp;
    tmp = doc.select("alias");
    if (tmp != null) {
      this.alias = (tmp.text());
    }
    tmp = doc.select("thumbImageUrl");
    if (tmp != null) {
View Full Code Here

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.