Package org.sf.mustru.docs

Examples of org.sf.mustru.docs.IndexableDoc


  Properties props = null; StarHandler sh = null;
  try { props = new Properties(); props.load(new FileInputStream(Constants.FILTER_FILE)); sh = new StarHandler(props); }
  catch (IOException e) { throw new RuntimeException("Could not read filtersFile + " + e.getMessage() ); }
 
  String filename = "/home/manuk/html/akr/ebooks/MYSQLmanual-a4.pdf";
  IndexableDoc doc = new IndexableDoc();
  sh.getDocument(filename, doc);
  int contentSizeLimit = Constants.DOC_LENGTH_MAXLIMIT;
  int textSize = doc.getContents().length();
  if (textSize > contentSizeLimit )
       doc.setContents( (new StringBuffer( doc.getContents().substring(0, contentSizeLimit ) ) ) );
 
  LingpipeTools sentTools = new LingpipeTools();
  sentTools.buildSentences(doc.getContents().toString()); String sentence; int i = 0;
  while ( (sentence = sentTools.nextSentence()) != null)
    { System.out.println(" " + i + "( " + sentence.length() + " ) : " +  sentence); i++;
 
  System.out.println("Ended testPdfFile");
}
View Full Code Here


     String key = doc.get("key");
     DatabaseEntry data = new DatabaseEntry();
     if (!dbt.fetch(key, data)) continue LOOP;

     //*-- extract the text
     IndexableDoc idoc = new IndexableDoc();
     idoc = (IndexableDoc) idoc.getBdbBinding().entryToObject(data);
     String line= idoc.getContents().toString();
     if (line.length() > 1000) line = line.substring(0, 999);
 
     retv.append(" Score: " + hits.score(i) + " TEXT: " + line + Constants.NEWLINE);
     retv.append(explanation.toString());
     retv.append("------------------------------------------------------------------");
View Full Code Here

  //*-- open the database environment
  boolean readOnly = true; boolean dupFlag = false;
  dbt.openDB(Constants.EXT_FILES_DB, readOnly, dupFlag);
 
  BooleanQuery query = new BooleanQuery();
  TupleBinding tpb = new IndexableDoc().getBdbBinding();
  DatabaseEntry data = new DatabaseEntry();
  if ( dbt.fetch(key, data) )
   {
     //*-- extract the text of the document
     IndexableDoc idoc = (IndexableDoc) tpb.entryToObject(data);
     String docText = idoc.getContents().toString();
   
     //*-- tokenize the text
     analyzer.setExtractEntities(false);
     Token[] tokens = null;
     try { tokens = tokensFromAnalysis(analyzer, docText); }
View Full Code Here

   try { props = new Properties(); props.load(new FileInputStream(Constants.FILTER_FILE)); sh = new StarHandler(props); }
   catch (IOException e) { throw new RuntimeException("Could not read filtersFile + " + e.getMessage() ); }
 
   for (int i = 0; i < sampleFiles.length; i++)
   { String filename = sampleDir + sampleFiles[i];
     IndexableDoc doc = new IndexableDoc();
     sh.getDocument(filename, doc);
     String contents = doc.getContents().toString();
     contents = (contents.length() > 2000) ? contents.substring(0, 2000): contents;
     System.out.println("File: " + sampleFiles[i] + " has content: " + contents);
   }
   System.out.println("Finished TestFilters");
  }
View Full Code Here

    String docClass = dprops.getProperty(ftype);
    if ( (docClass == null) || (docClass.equals("")) ) docClass = "org.sf.mustru.docs.TextDoc";
    Class docType = Class.forName(docClass);
    results[j++] = " Hit: " + i + ". Type: " + docClass + " Key: " + key + " Score: " + hits.score(i);

    IndexableDoc idoc =  (IndexableDoc) docType.newInstance();
    idoc = (IndexableDoc) idoc.getBdbBinding().entryToObject(data);     
    String contents = idoc.getContents().toString(); contents = StringTools.filterChars(contents);
    contents = StringTools.fillin(contents, 800, '.');
    results[j++] = "  File Name: " + idoc.getFileName();
    results[j++] = "  File Location: " + idoc.getFileLocation();
    results[j++] = "  File Type: " + idoc.getFileType();
    Method method = idoc.getClass().getMethod("getTextType", new Class[] {}  ); method.setAccessible(true);
    results[j++] = "  Text type: " + (String) method.invoke(idoc, new Object[] {} );
    results[j++] = "  File Size: " + idoc.getFileLength();
    results[j++] = "  File Rank: " + idoc.getFileRank();
    results[j++] = "  Title: " + idoc.getTitle();
    results[j++] = "  Author: " + idoc.getAuthor();
    results[j++] = "  Language: " + idoc.getLanguage();
    j = dumpResults(contents, j);
    results[j++] = "";
    if (explain
    { Explanation exp = is.explain(query, hits.id(i));
      results[j++] = "  Explanation: " + exp; }
View Full Code Here

   //*-- database for the file signature
   dbt.dropDB(Constants.EXT_FILES_DB);
   dbt.createDB(Constants.EXT_FILES_DB, createFlag, dupFlag);
   logger.info("Created py. database " + Constants.EXT_FILES_DB);
   dbt.dropSecDB(Constants.EXT_FILES_SECDB);
   SecKeyDoc skd = new SecKeyDoc( new IndexableDoc().getBdbBinding() );
   if (dbt.createSecDB(Constants.EXT_FILES_SECDB, false, skd))
    logger.info("Created sec. database " + Constants.EXT_FILES_SECDB);
   dbt.closeSecDB(); dbt.closeDB();

   //*-- create the messages database
View Full Code Here

  //*-- delete the entry and keep track of the deletions
  DbTools dbt = Constants.getDbt();
  dbt.openDB(Constants.EXT_FILES_DB, false, false);
  Cursor cursor = null; int count = 0; int errors = 0;
  ArrayList<String> delFiles = new ArrayList<String>();
  IndexableDoc idoc = new IndexableDoc();
  try
  {
   cursor = dbt.getCurrentDB().openCursor(null, null);
   DatabaseEntry key = new DatabaseEntry();
   DatabaseEntry data = new DatabaseEntry();
   LOOP: while (cursor.getNext(key, data, LockMode.DEFAULT) == OperationStatus.SUCCESS)
   {
    //*-- check if the file name exists in the filesystem
    if (!running) break LOOP;
    String filename = new String( key.getData(), "UTF-8");
    File file = new File(filename);
    if (!file.exists())
    { out.append("ERROR: File " + filename + " does not exist" + Constants.NEWLINE);
    idoc = (IndexableDoc) idoc.getBdbBinding().entryToObject(data);

    //*-- remove the entry from the database
    if (!dbt.delete(filename))
     logger.error("Could not delete " + filename + " from the database");
    else
View Full Code Here

  docTypes[j++] = key; }

  //*-- scan the database by file name and keep track of the types of files in the index
  DbTools dbt = Constants.getDbt();  
  dbt.openDB(Constants.EXT_FILES_DB, true, false);
  Cursor cursor = null; IndexableDoc idoc = new IndexableDoc();
  try
  {
   cursor = dbt.getCurrentDB().openCursor(null, null);
   DatabaseEntry key = new DatabaseEntry();
   DatabaseEntry data = new DatabaseEntry(); int i = 0;
   LOOP: while (cursor.getNext(key, data, LockMode.DEFAULT) == OperationStatus.SUCCESS)
   {
    if (!running) break LOOP;
    idoc = (IndexableDoc) idoc.getBdbBinding().entryToObject(data);
    String docType = idoc.getFileType();
    if (docType == null) docType = "unknown";
    String num =  stats.get(docType);
    if ( num == null) { docType = "unknown"; num =  stats.get(docType); }
    int val =  Integer.parseInt(num) + 1;
   
View Full Code Here

   String key = doc.get("key");
   DatabaseEntry data = new DatabaseEntry();            
   if (!dbt.fetch(key, data)) continue LOOP;

   //*-- extract the passage text
   IndexableDoc idoc = new IndexableDoc();
   idoc = (IndexableDoc) idoc.getBdbBinding().entryToObject(data);
   String line= idoc.getContents().toString();
   String passage[] = SearchTools.bestPassages(line, sQuestion.getNouns(), sQuestion.getVerbs(), sQuestion.getAdjectives(),
                     sQuestion.getBigrams(), sQuestion.getEntities());
  
   //*-- get the file name
   matcher = trimPattern.matcher(key); if (matcher != null) key = matcher.replaceFirst("");
View Full Code Here

   {
    Doc document = new Doc();
    String docClass = dprops.getProperty(ftype);
    if ( (docClass == null) || (docClass.equals("")) ) docClass = "org.sf.mustru.docs.TextDoc";
    Class docType = Class.forName(docClass);
    IndexableDoc idoc =  (IndexableDoc) docType.newInstance();
    idoc = (IndexableDoc) idoc.getBdbBinding().entryToObject(data)
  
    //*-- set the snippet of the document for the bean
    String contents = idoc.getContents().toString(); contents = StringTools.filterChars(contents);
    if (contents.length() > 5000) contents = contents.substring(0, 5000);
    idoc.setContents(new StringBuffer(snippet(contents)));
      
    //*-- handle the different types of documents     
    copyBean(idoc, document);
    document.setFormatFileLength(nf.format(document.getFileLength() / 1000.0) );
    document.setFormatMdate( df.format( new Date(document.getMdate()) ) );
View Full Code Here

TOP

Related Classes of org.sf.mustru.docs.IndexableDoc

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.