Package cc.twittertools.corpus.data

Examples of cc.twittertools.corpus.data.StatusStream


    }

    File indexLocation = new File(cmdline.getOptionValue(INDEX_OPTION));

    LOG.info("Indexing " + cmdline.getOptionValue(INPUT_OPTION));
    StatusStream stream;

    File file = new File(cmdline.getOptionValue(INPUT_OPTION));
    if (!file.exists()) {
      System.err.println("Error: " + file + " does not exist!");
      System.exit(-1);
    }

    if (file.isDirectory()) {
      stream = new JsonStatusCorpusReader(file);
    } else {
      stream = new JsonStatusBlockReader(file);
    }

    Analyzer analyzer = ANALYZER;
    Similarity similarity = new ConstantNormSimilarity();
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_31, analyzer);
    config.setSimilarity(similarity);
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); // Overwrite existing.

    IndexWriter writer = new IndexWriter(FSDirectory.open(indexLocation), config);

    int cnt = 0;
    Status status;
    try {
      while ((status = stream.next()) != null) {
        if (status.getText() == null) {
          continue;
        }

        cnt++;
        String createdAt = status.getCreatedAt();
        Document doc = new Document();
        doc.add(new Field(StatusField.ID.name, status.getId() + "",
            Store.YES, Index.NOT_ANALYZED_NO_NORMS));
        doc.add(new Field(StatusField.SCREEN_NAME.name, status.getScreenname(),
            Store.YES, Index.NOT_ANALYZED_NO_NORMS));
        doc.add(new Field(StatusField.CREATED_AT.name, createdAt, Store.YES, Index.NO));
        doc.add(new Field(StatusField.TEXT.name, status.getText(), Store.YES, Index.ANALYZED));

        String[] arr = createdAt.split(" ");
        String createDay = new StringBuffer().append(arr[1]).append("_").append(arr[2]).toString();
        doc.add(new Field(StatusField.DAY.name, createDay, Store.YES, Index.NOT_ANALYZED_NO_NORMS));

        writer.addDocument(doc);
        if (cnt % 10000 == 0) {
          LOG.info(cnt + " statuses indexed");
        }
      }
      LOG.info("Optimizing index...");
      writer.optimize();
      writer.close();
    } finally {
      stream.close();
    }

    LOG.info(String.format("Total of %s statuses indexed", cnt));
  }
View Full Code Here


      System.exit(-1);
    }

    PrintStream out = new PrintStream(System.out, true, "UTF-8");

    StatusStream stream;
    // Pick the reader based on whether the input path is a directory or a single file.
    File file = new File(cmdline.getOptionValue(INPUT_OPTION));
    if (!file.exists()) {
      System.err.println("Error: " + file + " does not exist!");
      System.exit(-1);
    }

    if (file.isDirectory()) {
      stream = new JsonStatusCorpusReader(file);
    } else {
      stream = new JsonStatusBlockReader(file);
    }

    int cnt = 0;
    Status status;
    while ((status = stream.next()) != null) {
      if (cmdline.hasOption(DUMP_OPTION)) {
        String text = status.getText();
        if (text != null) {
          text = text.replaceAll("\\s+", " ");
          text = text.replaceAll("\0", "");
        }
        out.println(String.format("%d\t%s\t%s\t%s", status.getId(), status.getScreenname(),
            status.getCreatedAt(), text));
      }
      cnt++;
      if ( cnt % 10000 == 0 && cmdline.hasOption(VERBOSE_OPTION)) {
        LOG.info(cnt + " statuses read");
      }
    }
    stream.close();
    LOG.info(String.format("Total of %s statuses read.", cnt));
  }
View Full Code Here

      System.err.println("Error: " + file + " does not exist!");
      System.exit(-1);
    }

    PrintStream out = new PrintStream(System.out, true, "UTF-8");
    StatusStream stream = new JsonStatusCorpusReader(file);
    Status status;
    while ((status = stream.next()) != null) {
      if (tweetids.contains(status.getId())) {
        out.println(status.getJsonObject().toString());
      }
    }
    stream.close();
    out.close();
  }
View Full Code Here

    LongOpenHashSet seen = new LongOpenHashSet();
    TreeMap<Long, String> tweets = Maps.newTreeMap();

    PrintStream out = new PrintStream(System.out, true, "UTF-8");
    StatusStream stream = new JsonStatusCorpusReader(file);
    Status status;
    int cnt = 0;
    while ((status = stream.next()) != null) {
      if (!tweetids.contains(status.getId())) {
        LOG.error("tweetid " + status.getId() + " doesn't belong in collection");
        continue;
      }
      if (seen.contains(status.getId())) {
        LOG.error("tweetid " + status.getId() + " already seen!");
        continue;
      }

      tweets.put(status.getId(), status.getJsonObject().toString());
      seen.add(status.getId());
      cnt++;
    }
    LOG.info("total of " + cnt + " tweets in subcollection.");

    for ( Map.Entry<Long, String> entry : tweets.entrySet()){
      out.println(entry.getValue());
    }

    stream.close();
    out.close();
  }
View Full Code Here

  }

  public boolean verify() throws IOException {
    LOG.info(String.format("Reading statuses read from %s.", statuses));

    StatusStream stream;
    if (statuses.isDirectory()) {
      throw new RuntimeException(statuses + " cannot be a directory!");
    }
    stream = new JsonStatusBlockReader(statuses);

    Map<Long, String> ids = new HashMap<Long, String>();

    int cnt = 0;
    Status status;
    while ((status = stream.next()) != null) {
      ids.put(status.getId(), status.getJsonString());
      cnt++;
    }
    LOG.info(String.format("Total of %d statuses read.", cnt));
View Full Code Here

    if (!file.exists()) {
      System.err.println("Error: " + file + " does not exist!");
      System.exit(-1);
    }

    StatusStream stream = new JsonStatusCorpusReader(file);

    Directory dir = FSDirectory.open(new File(indexPath));
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, IndexStatuses.ANALYZER);
    config.setOpenMode(OpenMode.CREATE);

    LOG.info("collection: " + collectionPath);
    LOG.info("index: " + indexPath);

    IndexWriter writer = new IndexWriter(dir, config);
    int cnt = 0;
    Status status;
    try {
      while ((status = stream.next()) != null) {
        if (status.getText() == null) {
          continue;
        }

        cnt++;
        Document doc = new Document();
        doc.add(new LongField(StatusField.ID.name, status.getId(), Field.Store.YES));
        doc.add(new LongField(StatusField.EPOCH.name, status.getEpoch(), Field.Store.YES));
        doc.add(new TextField(StatusField.SCREEN_NAME.name, status.getScreenname(), Store.YES));

        doc.add(new Field(StatusField.TEXT.name, status.getText(), textOptions));

        doc.add(new IntField(StatusField.FRIENDS_COUNT.name, status.getFollowersCount(), Store.YES));
        doc.add(new IntField(StatusField.FOLLOWERS_COUNT.name, status.getFriendsCount(), Store.YES));
        doc.add(new IntField(StatusField.STATUSES_COUNT.name, status.getStatusesCount(), Store.YES));

        long inReplyToStatusId = status.getInReplyToStatusId();
        if (inReplyToStatusId > 0) {
          doc.add(new LongField(StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId, Field.Store.YES));
          doc.add(new LongField(StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId(), Field.Store.YES));
        }
       
        String lang = status.getLang();
        if (!lang.equals("unknown")) {
          doc.add(new TextField(StatusField.LANG.name, status.getLang(), Store.YES));
        }
       
        long retweetStatusId = status.getRetweetedStatusId();
        if (retweetStatusId > 0) {
          doc.add(new LongField(StatusField.RETWEETED_STATUS_ID.name, retweetStatusId, Field.Store.YES));
          doc.add(new LongField(StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId(), Field.Store.YES));
          doc.add(new IntField(StatusField.RETWEET_COUNT.name, status.getRetweetCount(), Store.YES));
          if ( status.getRetweetCount() < 0 || status.getRetweetedStatusId() < 0) {
            LOG.warn("Error parsing retweet fields of " + status.getId());
          }
        }
       
        writer.addDocument(doc);
        if (cnt % 100000 == 0) {
          LOG.info(cnt + " statuses indexed");
        }
      }

      LOG.info(String.format("Total of %s statuses added", cnt));
     
      if (cmdline.hasOption(OPTIMIZE_OPTION)) {
        LOG.info("Merging segments...");
        writer.forceMerge(1);
        LOG.info("Done!");
      }

      LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms");
    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      writer.close();
      dir.close();
      stream.close();
    }
  }
View Full Code Here

    if (!file.exists()) {
      System.err.println("Error: " + file + " does not exist!");
      System.exit(-1);
    }

    StatusStream stream = new JsonStatusCorpusReader(file);

    Status status;
    while ((status = stream.next()) != null) {
      System.out.println(status.getId() + "\t" + status.getScreenname());
    }
  }
View Full Code Here

    if (!file.exists()) {
      System.err.println("Error: " + file + " does not exist!");
      System.exit(-1);
    }

    StatusStream stream = new JsonStatusCorpusReader(file);

    Status status;
    while ((status = stream.next()) != null) {
      System.out.println(status.getId() + "\t" + status.getScreenname());
    }
  }
View Full Code Here

    if (!file.exists()) {
      System.err.println("Error: " + file + " does not exist!");
      System.exit(-1);
    }

    StatusStream stream = new JsonStatusCorpusReader(file);

    Directory dir = FSDirectory.open(new File(indexPath));
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, IndexStatuses.ANALYZER);
    config.setOpenMode(OpenMode.CREATE);

    IndexWriter writer = new IndexWriter(dir, config);
    int cnt = 0;
    Status status;
    try {
      while ((status = stream.next()) != null) {
        if (status.getText() == null) {
          continue;
        }

        // Skip tweet ids that appear in the deletes set.
        if (deletes != null && deletes.contains(status.getId())) {
          continue;
        }

        if (status.getId() > maxId) {
          continue;
        }

        cnt++;
        Document doc = new Document();
        doc.add(new LongField(StatusField.ID.name, status.getId(), Field.Store.YES));
        doc.add(new LongField(StatusField.EPOCH.name, status.getEpoch(), Field.Store.YES));
        doc.add(new TextField(StatusField.SCREEN_NAME.name, status.getScreenname(), Store.YES));

        doc.add(new Field(StatusField.TEXT.name, status.getText(), textOptions));

        doc.add(new IntField(StatusField.FRIENDS_COUNT.name, status.getFollowersCount(), Store.YES));
        doc.add(new IntField(StatusField.FOLLOWERS_COUNT.name, status.getFriendsCount(), Store.YES));
        doc.add(new IntField(StatusField.STATUSES_COUNT.name, status.getStatusesCount(), Store.YES));

        long inReplyToStatusId = status.getInReplyToStatusId();
        if (inReplyToStatusId > 0) {
          doc.add(new LongField(StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId, Field.Store.YES));
          doc.add(new LongField(StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId(), Field.Store.YES));
        }
       
        String lang = status.getLang();
        if (!lang.equals("unknown")) {
          doc.add(new TextField(StatusField.LANG.name, status.getLang(), Store.YES));
        }
       
        long retweetStatusId = status.getRetweetedStatusId();
        if (retweetStatusId > 0) {
          doc.add(new LongField(StatusField.RETWEETED_STATUS_ID.name, retweetStatusId, Field.Store.YES));
          doc.add(new LongField(StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId(), Field.Store.YES));
          doc.add(new IntField(StatusField.RETWEET_COUNT.name, status.getRetweetCount(), Store.YES));
          if ( status.getRetweetCount() < 0 || status.getRetweetedStatusId() < 0) {
            LOG.warn("Error parsing retweet fields of " + status.getId());
          }
        }
       
        writer.addDocument(doc);
        if (cnt % 100000 == 0) {
          LOG.info(cnt + " statuses indexed");
        }
      }

      LOG.info(String.format("Total of %s statuses added", cnt));
     
      if (cmdline.hasOption(OPTIMIZE_OPTION)) {
        LOG.info("Merging segments...");
        writer.forceMerge(1);
        LOG.info("Done!");
      }

      LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms");
    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      writer.close();
      dir.close();
      stream.close();
    }
  }
View Full Code Here

    String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION);
    String indexPath = cmdline.getOptionValue(INDEX_OPTION);

    long startTime = System.currentTimeMillis();

    StatusStream stream;

    File file = new File(collectionPath);
    if (!file.exists()) {
      System.err.println("Error: " + file + " does not exist!");
      System.exit(-1);
    }

    if (cmdline.hasOption(TSV_OPTION)) {
      stream = new TSVStatusCorpusReader(file);
    } else {
      stream = new JsonStatusCorpusReader(file);
    }

    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_41);

    Directory dir = FSDirectory.open(new File(indexPath));
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_41, analyzer);
    config.setOpenMode(OpenMode.CREATE);

    LOG.info("collection: " + collectionPath);
    LOG.info("index: " + indexPath);

    IndexWriter writer = new IndexWriter(dir, config);
    int cnt = 0;
    Status status;
    try {
      while ((status = stream.next()) != null) {
        if (status.getText() == null) {
          continue;
        }

        cnt++;
        Document doc = new Document();
        doc.add(new LongField(StatusField.ID.name, status.getId(), Field.Store.YES));
        doc.add(new LongField(StatusField.EPOCH.name, status.getEpoch(), Field.Store.YES));
        doc.add(new TextField(StatusField.SCREEN_NAME.name, status.getScreenname(), Store.YES));
        doc.add(new TextField(StatusField.TEXT.name, status.getText(), Store.YES));

        doc.add(new IntField(StatusField.FRIENDS_COUNT.name, status.getFollowersCount(), Store.YES));
        doc.add(new IntField(StatusField.FOLLOWERS_COUNT.name, status.getFriendsCount(), Store.YES));
        doc.add(new IntField(StatusField.STATUSES_COUNT.name, status.getStatusesCount(), Store.YES));

        long inReplyToStatusId = status.getInReplyToStatusId();
        if (inReplyToStatusId > 0) {
          doc.add(new LongField(StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId, Field.Store.YES));
          doc.add(new LongField(StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId(), Field.Store.YES));
        }
       
        String lang = status.getLang();
        if (!lang.equals("unknown")) {
          doc.add(new TextField(StatusField.LANG.name, status.getLang(), Store.YES));
        }
       
        long retweetStatusId = status.getRetweetedStatusId();
        if (retweetStatusId > 0) {
          doc.add(new LongField(StatusField.RETWEETED_STATUS_ID.name, retweetStatusId, Field.Store.YES));
          doc.add(new LongField(StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId(), Field.Store.YES));
          doc.add(new IntField(StatusField.RETWEET_COUNT.name, status.getRetweetCount(), Store.YES));
          if ( status.getRetweetCount() < 0 || status.getRetweetedStatusId() < 0) {
            LOG.warn("Error parsing retweet fields of " + status.getId());
          }
        }
       
        writer.addDocument(doc);
        if (cnt % 100000 == 0) {
          LOG.info(cnt + " statuses indexed");
        }
      }

      LOG.info(String.format("Total of %s statuses added", cnt));
      LOG.info("Merging segments...");
      writer.forceMerge(1);
      LOG.info("Done!");
      LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms");
    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      writer.close();
      dir.close();
      stream.close();
    }
  }
View Full Code Here

TOP

Related Classes of cc.twittertools.corpus.data.StatusStream

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.