Package org.terrier.indexing.tokenisation

Examples of org.terrier.indexing.tokenisation.EnglishTokeniser$EnglishTokenStream


    } catch (IOException ioe) {
      logger.fatal("ERROR: Problem opening TRECDocument test file : "+ ioe);
      logger.fatal("Exiting ...");
      ioe.printStackTrace();
    }
    return new TaggedDocument(b, null, new EnglishTokeniser());
  }
View Full Code Here


    Document[] sourceDocs = new Document[docnos.length];
    for(int i=0;i<docnos.length;i++)
    {
      Map<String,String> docProperties = new HashMap<String,String>();
      docProperties.put("filename", docnos[i]);
      sourceDocs[i] = new FileDocument(new ByteArrayInputStream(documents[i].getBytes()), docProperties, new EnglishTokeniser());
    }
    Collection col = new CollectionDocumentList(sourceDocs, "filename");
    indexer.index(new Collection[]{col});   
    Index index = Index.createIndex();
    assertEquals(sourceDocs.length, index.getCollectionStatistics().getNumberOfDocuments());
View Full Code Here

    Map<String,String> doc1Props = new HashMap<String,String>();doc1Props.put("filename", "doc1");
    Map<String,String> doc2Props = new HashMap<String,String>();doc2Props.put("filename", "doc2");
   
    Document[] sourceDocs = !fieldsExpected ?
        new Document[]{
            new FileDocument("doc1", new ByteArrayInputStream("cats dogs horses".getBytes()), new EnglishTokeniser()),
            new FileDocument("doc2", new ByteArrayInputStream("chicken cats chicken chicken".getBytes()), new EnglishTokeniser())
          }
        : new Document[]{
            new TaggedDocument(new ByteArrayInputStream("<title>cats</title> dogs horses".getBytes()), doc1Props, new EnglishTokeniser()),
            new TaggedDocument(new ByteArrayInputStream("<title>chicken</title> cats chicken chicken".getBytes()), doc2Props, new EnglishTokeniser())
          };
           
    int[] doclens = new int[]{3, 4};
   
   
View Full Code Here

TOP

Related Classes of org.terrier.indexing.tokenisation.EnglishTokeniser$EnglishTokenStream

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., and is owned by ORACLE Inc. Contact coftware#gmail.com.