Package com.ikanow.infinit.e.data_model.store.document

Examples of com.ikanow.infinit.e.data_model.store.document.DocumentPojo


   */
  private DocumentPojo createDoc(CommitType commitType, ResultSet rs, ResultSetMetaData md,
      SourcePojo source, String docUrl)
  {
    // Set up Feed object to be used to contain information
    DocumentPojo doc = null;
 
    try
    {
      // Check to see if the commit type is an insert or an update
      if (commitType == CommitType.insert)
      {
        // create the doc pojo
        doc = new DocumentPojo();
        doc.setUrl(docUrl);
        doc.setCreated(new Date());
      }
      else
      {
        //TODO (INF-1300): support for updated docs (will want to save old creation time, zap everything else?)
      }
     
      doc.setModified(new Date());
     
      // Set the title/description, stripping out any html markup
      if (null != source.getDatabaseConfig().getTitle()) {
        if (rs.getString(source.getDatabaseConfig().getTitle()) != null)
        {
          doc.setTitle(rs.getString(source.getDatabaseConfig().getTitle()).replaceAll("\\<.*?\\>", ""));
        }
      }
     
      if (null != source.getDatabaseConfig().getSnippet()) {
        if (rs.getString(source.getDatabaseConfig().getSnippet()) != null)
        {
          doc.setDescription(rs.getString(source.getDatabaseConfig().getSnippet()).replaceAll("\\<.*?\\>", ""));
        }
      }

      if (null != source.getDatabaseConfig().getPublishedDate()) {
        if (rs.getString(source.getDatabaseConfig().getPublishedDate()) != null)
        {
          Object d = null;
          try
          {
            Object o = rs.getDate(source.getDatabaseConfig().getPublishedDate());
            d = convertJdbcTypes(null, o);
          }
          catch (Exception e)
          {
            d = new Date();
          }
          doc.setPublishedDate((Date) d);
        }
        else
        {
          doc.setPublishedDate(new Date());
        }
      }
      else
      {
        doc.setPublishedDate(new Date());
      }
     
      // Create a list of metadata to be added to the doc
      for ( int i = 1; i <= md.getColumnCount(); i++ )
      {
        String column = md.getColumnLabel(i);
        Object value = rs.getObject(i);
       
        // Convert value to standard Java type from JDBC type if needed
        value = convertJdbcTypes(column, value);
       
        if (  (column != null) && (value != null) )
        {
          if (!source.getDatabaseConfig().getPreserveCase()) {
            column = column.toLowerCase();
          }
          doc.addToMetadata(column, value);
        }
      }
    }
    catch (SQLException e)
    {
      // ... (snippet truncated)
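
The snippet above relies on a convertJdbcTypes helper whose body isn't shown. A minimal sketch of what such a JDBC-to-Java normalization might do, purely for illustration (the method body below is an assumption, not the project's actual implementation):

  // Hypothetical sketch: normalize common JDBC types to plain Java types
  private static Object convertJdbcTypesSketch(String column, Object value) {
    if (value instanceof java.sql.Timestamp) {
      return new java.util.Date(((java.sql.Timestamp) value).getTime());
    }
    if (value instanceof java.sql.Date) {
      return new java.util.Date(((java.sql.Date) value).getTime());
    }
    return value; // pass everything else through unchanged
  }

Note also the title/description handling: replaceAll("\\<.*?\\>", "") is a non-greedy regex that strips anything between angle brackets, so "<b>Bold</b> title" becomes "Bold title".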


      Iterator<DocumentPojo> it = documents.iterator();
      int nDocs = 0;
      while (it.hasNext()) {
        nDocs++;
        DocumentPojo d = it.next();
        regexDuplicates = new HashSet<String>();
        cleaner = null;

        // For feeds, may need to go get the document text manually,
        // it's a bit horrible since
        // obviously may then go get the data again for full text
        // extraction
        boolean bFetchedUrl = false;
        if (bGetRawDoc && (null == d.getFullText())) {
          if (null == source.getRssConfig()) {
            source.setRssConfig(new SourceRssConfigPojo()); // (makes logic easier down the road)
          }
          // (first time through, sleep following a URL/RSS access)
          if ((1 == nDocs) && (null != source.getUrl())) { // (have already made a call to RSS (or "searchConfig" URL))
            try {
              Thread.sleep(nBetweenDocs_ms);
            } catch (InterruptedException e) {
              // (ignore the interruption and carry on)
            }
          }
          // TESTED (first time only, correct value after searchConfig override)

          try {
            if ((null != source.useTextExtractor()) && source.useTextExtractor().equalsIgnoreCase("tika")) {
              // Special case: if tika enabled then do that first
              if (null == tikaExtractor) {
                tikaExtractor = new TextExtractorTika();
              }
              tikaExtractor.extractText(d); // (instantiate the extractor once, but extract for every doc - not just the first)
            }
            else {
              this.getRawTextFromUrlIfNeeded(d, source.getRssConfig());
            }
            bFetchedUrl = true;
           
          } catch (Exception e) { // Failed to get full text twice, remove doc
            if (e instanceof SecurityException) { // This seems worthy of actually logging, even though it's a lowly doc error
              contextController.getHarvestStatus().logMessage(e.getMessage(), true);
            }
            contextController.handleExtractError(e, source); //handle extractor error if need be       
            it.remove();
            d.setTempSource(null); // (can safely corrupt this doc since it's been removed)           
            continue;
          }
        }
        long nTime_ms = System.currentTimeMillis();
        // ^^^ (end slight hack to get raw text to the UAH for RSS feeds)
        // ... (snippet truncated)
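
Note how failed documents are dropped with it.remove() rather than documents.remove(d): removing via the Iterator is the only safe way to delete from the underlying collection mid-loop. A self-contained illustration of the same pattern:

  import java.util.ArrayList;
  import java.util.Arrays;
  import java.util.Iterator;
  import java.util.List;

  List<String> documents = new ArrayList<String>(Arrays.asList("ok1", "bad", "ok2"));
  Iterator<String> it = documents.iterator();
  while (it.hasNext()) {
    String d = it.next();
    if (d.startsWith("bad")) {
      it.remove(); // documents.remove(d) here would throw ConcurrentModificationException
    }
  }
  // documents is now [ok1, ok2]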

            // (applies this value to sleeps inside UAH.executeHarvest)
            feedConfig.setWaitTimeOverride_ms(searchConfig.getWaitTimeBetweenPages_ms());
          }
          //TESTED (including RSS-level value being written back again and applied in SAH/UAH code)
         
          DocumentPojo searchDoc = docToSplit;
          Object[] savedMeta = null;
          if (null == searchDoc) {
            searchDoc = new DocumentPojo();
            // Required terms:
            searchDoc.setUrl(url);
            searchDoc.setScore((double)nIteratingDepth); // (spidering param)
            // Handy terms
            if (null != src.getHarvestStatus()) {
              searchDoc.setModified(src.getHarvestStatus().getHarvested()); // the last time the source was harvested - can use to determine how far back to go
            }
            // If these exist (they won't normally), fill them:
            searchDoc.setFullText(currFullText);
            searchDoc.setDescription(currDesc);
            searchDoc.setTitle(currTitle);
          }//TOTEST
          else if (null != searchDoc.getMetadata()){
            savedMeta = searchDoc.getMetadata().remove("searchEngineSubsystem");
              // (this is normally null)
          }//TOTEST
          UnstructuredAnalysisHarvester dummyUAH = new UnstructuredAnalysisHarvester();
          boolean bMoreDocs = (nPage < nMaxPages - 1);
          Object[] searchResults = null;
          try {
            dummyUAH.executeHarvest(context, src, searchDoc, false, bMoreDocs);
              // (the leading false means that we never sleep *before* the query, only after)
            searchResults = searchDoc.getMetadata().get("searchEngineSubsystem");
          }
          finally {
            if (null != savedMeta) { // (this is really obscure but handle the case where someone has created this meta field already)
              searchDoc.getMetadata().put("searchEngineSubsystem", savedMeta);             
            }
            else if ((null != searchDoc) && (null != searchDoc.getMetadata())) {
              searchDoc.getMetadata().remove("searchEngineSubsystem");
            }
          }//TOTEST
         
          //DEBUG
          //System.out.println("NEW DOC MD: " + new com.google.gson.GsonBuilder().setPrettyPrinting().create().toJson(searchDoc.getMetadata()));
         
          // Create extraUrl entries from the metadata
     
          if ((null != searchResults) && (searchResults.length > 0)) {
            for (Object searchResultObj: searchResults) {
              try {
                BasicDBObject bsonObj = (BasicDBObject)searchResultObj;
               
                // Fields: url (required), title, description, publishedDate, fullText (optional), plus the "spiderOut" flag
                String linkUrl = bsonObj.getString(DocumentPojo.url_);
               
                nLinksFound++;
                if (!dedupSet.contains(linkUrl)) {
                  dedupSet.add(linkUrl);                 
               
                  String linkTitle = bsonObj.getString(DocumentPojo.title_);
                  String linkDesc = bsonObj.getString(DocumentPojo.description_);
                  String linkPubDate = bsonObj.getString(DocumentPojo.publishedDate_);
                  String linkFullText = bsonObj.getString(DocumentPojo.fullText_);
                  String spiderOut = bsonObj.getString("spiderOut");
                 
                  if (null != linkUrl) {
                    SourceRssConfigPojo.ExtraUrlPojo link = new SourceRssConfigPojo.ExtraUrlPojo();
                    link.url = linkUrl;
                    link.title = linkTitle;
                    link.description = linkDesc;
                    link.publishedDate = linkPubDate;
                    link.fullText = linkFullText;
                    if (!stopLinkFollowing && (null != itUrls) && (null != spiderOut) && spiderOut.equalsIgnoreCase("true")) {
                      // In this case, add it back to the original list for chained processing
           
                      if (null == waitingList) {
                        waitingList = new LinkedList<ExtraUrlPojo>();
                      }
                      waitingList.add(link);
                        // (can't result in an infinite loop like this because we check
                         //  dedupSet.size() and only allow links not already in dedupSet)
                     
                    } //TESTED

                    if (null != linkTitle) {
                     
                      boolean isDuplicate = false;
                      if (!stopPaginating && searchConfig.getStopPaginatingOnDuplicate()) {
                        // Quick duplicate check (full one gets done later)
                        isDuplicate = context.getDuplicateManager().isDuplicate_Url(linkUrl, src, null);
                      }//TESTED                     
                      if (!isDuplicate) {
                        if (null == feedConfig.getExtraUrls()) {
                          feedConfig.setExtraUrls(new ArrayList<ExtraUrlPojo>(searchResults.length));
                        }
                        feedConfig.getExtraUrls().add(link);
                      }
                      else {
                        stopPaginating = true;
                        if (null == feedConfig.getSearchConfig().getPageChangeRegex()) {
                          stopLinkFollowing = true;
                        }//TESTED                     
                      }//TESTED
                    }
                   
                  }
                }//(end if URL not already found)
              }
              catch (Exception e) {
                // (just carry on)
                //DEBUG
                //e.printStackTrace();
              }
            }
          }//TESTED
          else if (0 == nPage) { // no links returned - if this is page 1 and an error message was saved, escalate it
            Object[] onError = searchDoc.getMetadata().get("_ONERROR_");
            if ((null != onError) && (onError.length > 0) && (onError[0] instanceof String) && !(((String)(onError[0]))).isEmpty()) {
              throw new ExtractorSourceLevelTransientException("generateFeedFromSearch: _ONERROR_: " + onError[0]);         
            }
          }//TESTED

          if (context.isStandalone()) { // debug mode, will display some additional logging
            Object[] onDebug = searchDoc.getMetadata().get("_ONDEBUG_");
            if ((null != onDebug) && (onDebug.length > 0)) {
              for (Object debug: onDebug) {
                if (debug instanceof String) {
                  context.getHarvestStatus().logMessage("_ONDEBUG_: " + (String)debug, true);                             
                }
        // ... (snippet truncated)
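
A detail worth calling out in the snippet above: the "searchEngineSubsystem" metadata entry is treated as scratch space, so any pre-existing value is removed up front and put back in the finally block, and the temporary search results never leak into (or clobber) user metadata. The same idiom in miniature (the map and key names below are illustrative, not the project's API):

  import java.util.HashMap;
  import java.util.Map;

  Map<String, Object[]> metadata = new HashMap<String, Object[]>();
  Object[] savedMeta = metadata.remove("scratchKey"); // stash any pre-existing value
  try {
    metadata.put("scratchKey", new Object[] { "temporary results" }); // use as scratch space
  }
  finally {
    if (null != savedMeta) {
      metadata.put("scratchKey", savedMeta); // restore the caller's original value
    }
    else {
      metadata.remove("scratchKey"); // or clean the scratch entry away entirely
    }
  }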

       
        if (JsonToken.BEGIN_ARRAY == tok) {
          reader.beginArray();
          if (objectIdentifiers.isEmpty()) {
            while (reader.hasNext()) {
              DocumentPojo doc = convertJsonToDocument(reader, parser, textOnly);
              if (null != doc) {
                docList.add(doc);
                if (++nCurrDocs >= nMaxDocs) {
                  return docList;
                }
              }
            }
          }//TESTED
          else {
            while (reader.hasNext()) {
              getDocumentsFromJson(reader, parser, docList, false, textOnly);
            }
          }//TESTED
        }
        else if (JsonToken.BEGIN_OBJECT == tok) {
          if (objectIdentifiers.isEmpty()) {
            DocumentPojo doc = convertJsonToDocument(reader, parser, textOnly);
            if (null != doc) {
              docList.add(doc);
              if (++nCurrDocs >= nMaxDocs) {
                return docList;
              }
            // ... (snippet truncated)
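
Both branches above are built on Gson's streaming JsonReader, which walks the token stream without materializing the whole document. A minimal sketch of the same beginArray/hasNext/endArray pattern, assuming only the standard Gson streaming API (the calls throw IOException, ignored here for brevity):

  import com.google.gson.stream.JsonReader;
  import java.io.StringReader;

  JsonReader reader = new JsonReader(new StringReader("[{\"title\":\"t1\"},{\"title\":\"t2\"}]"));
  reader.beginArray();              // consume '['
  while (reader.hasNext()) {        // one iteration per array element
    reader.beginObject();           // consume '{'
    while (reader.hasNext()) {
      String name = reader.nextName();    // e.g. "title"
      String value = reader.nextString(); // e.g. "t1"
    }
    reader.endObject();             // consume '}'
  }
  reader.endArray();                // consume ']'
  reader.close();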

      return null;
    }
    //TESTED
   
    // Primary key and create doc
    DocumentPojo doc = new DocumentPojo();
    if ((null != primaryKey) && (null != sourceName)) { // (primaryKey here is the configured PK field name)
      String pkValue = getPrimaryKey(meta);
      if (null != pkValue) {
        doc.setUrl(sourceName + pkValue);
      }
    }
    if (textOnly) {
      doc.setFullText(meta.toString());
    }
    else {
      doc.setFullText("");
      if (meta.isJsonArray()) {
        ArrayList<Object> metaArray = new ArrayList<Object>(meta.getAsJsonArray().size());
        for (JsonElement je: meta.getAsJsonArray()) {
          if (je.isJsonObject()) {
            metaArray.add(convertJsonObjectToLinkedHashMap(je.getAsJsonObject(), _memUsage));
          }
        }
        doc.addToMetadata("json", metaArray.toArray());
      }
      else if (meta.isJsonObject()) {
        doc.addToMetadata("json", convertJsonObjectToLinkedHashMap(meta.getAsJsonObject(), _memUsage));
      }     
    }
    return doc;
   
  } //TESTED
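convertJsonObjectToLinkedHashMap is another helper whose body isn't shown. A hypothetical sketch of a recursive Gson-tree-to-map conversion (the name echoes the original, but the body below is an assumption for illustration only):

  import com.google.gson.JsonElement;
  import com.google.gson.JsonObject;
  import java.util.LinkedHashMap;
  import java.util.Map;

  static LinkedHashMap<String, Object> toLinkedHashMapSketch(JsonObject obj) {
    LinkedHashMap<String, Object> map = new LinkedHashMap<String, Object>();
    for (Map.Entry<String, JsonElement> entry : obj.entrySet()) {
      JsonElement v = entry.getValue();
      if (v.isJsonObject()) {
        map.put(entry.getKey(), toLinkedHashMapSketch(v.getAsJsonObject())); // recurse into nested objects
      }
      else if (v.isJsonPrimitive()) {
        map.put(entry.getKey(), v.getAsJsonPrimitive().getAsString());
      }
      // (arrays and JSON nulls omitted for brevity)
    }
    return map;
  }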

     
      if (null == dbo) {
        rp.setResponse(new ResponseObject("Doc Info",true,"Document not found"));
        return rp;
      }
      DocumentPojo dp = DocumentPojo.fromDb(dbo, DocumentPojo.class);
      if (bReturnFullText)
      {
        if (null == dp.getFullText()) { // (Some things like database records might have this stored already)
          byte[] storageArray = new byte[200000];
          DBCollection contentDB = DbManager.getDocument().getContent();
          BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, dp.getUrl());
          contentQ.put(CompressedFullTextPojo.sourceKey_, new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, dp.getSourceKey())));
          BasicDBObject fields = new BasicDBObject(CompressedFullTextPojo.gzip_content_, 1);
          BasicDBObject dboContent = (BasicDBObject) contentDB.findOne(contentQ, fields);
          if (null != dboContent) {
            byte[] compressedData = ((byte[])dboContent.get(CompressedFullTextPojo.gzip_content_));       
            ByteArrayInputStream in = new ByteArrayInputStream(compressedData);
            GZIPInputStream gzip = new GZIPInputStream(in);       
            int nRead = 0;
            StringBuffer output = new StringBuffer();
            while (nRead >= 0) {
              nRead = gzip.read(storageArray, 0, 200000);
              if (nRead > 0) {
                String s = new String(storageArray, 0, nRead, "UTF-8");
                output.append(s);
              }
            }
            dp.setFullText(output.toString());
            dp.makeFullTextNonTransient();
          }
        }       
      }
      else if (!returnRawData) {
        dp.setFullText(null); // (obviously will normally contain full text anyway)
      }
      else // if ( returnRawData )
      {
        //check if the harvest type is file, return the file instead
        //if file is db return the json
        //get source
        SourcePojo source = getSourceFromKey(dp.getSourceKey());
        if ( source.getExtractType().equals( "File" ))
        {
          //get file from harvester
          String fileURL = dp.getUrl();
          if ( dp.getSourceUrl() != null )
            fileURL = dp.getSourceUrl();
          byte[] bytes = FileHarvester.getFile(fileURL, source);
          if ( bytes == null )
          {
            // Try returning JSON instead
            String json = ApiManager.mapToApi(dp, new DocumentPojoApiMap());
            // ... (snippet truncated)
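
One caveat in the decompression loop above: decoding each raw byte chunk with new String(..., "UTF-8") can split a multi-byte UTF-8 sequence across two reads and corrupt characters at chunk boundaries. Wrapping the GZIPInputStream in an InputStreamReader sidesteps this; a minimal sketch using only standard java.io and java.util.zip classes:

  import java.io.ByteArrayInputStream;
  import java.io.IOException;
  import java.io.InputStreamReader;
  import java.io.Reader;
  import java.util.zip.GZIPInputStream;

  static String decompressToString(byte[] compressedData) throws IOException {
    StringBuilder output = new StringBuilder();
    Reader reader = new InputStreamReader(
        new GZIPInputStream(new ByteArrayInputStream(compressedData)), "UTF-8");
    try {
      char[] buf = new char[8192];
      int nRead;
      while ((nRead = reader.read(buf)) >= 0) {
        output.append(buf, 0, nRead); // the Reader handles multi-byte boundaries internally
      }
    }
    finally {
      reader.close();
    }
    return output.toString();
  }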

      if (bMatch) {
        JsonElement meta = parser.parse(reader);
       
        if (meta.isJsonObject()) { // (basically duplicates logic from convertJsonToDocument)
          if (textOnly || checkIfMandatoryFieldsExist(meta)) {
            DocumentPojo doc = new DocumentPojo();
            if ((null != primaryKey) && (null != sourceName)) {
              String pkValue = getPrimaryKey(meta);
              if (null != pkValue) {
                doc.setUrl(sourceName + pkValue);
              }
            }
            if (textOnly) {
              doc.setFullText(meta.toString());
            }
            else {
              doc.addToMetadata("json", convertJsonObjectToLinkedHashMap(meta.getAsJsonObject(), _memUsage));
            }
            docList.add(doc);
            if (++nCurrDocs >= nMaxDocs) {
              while (reader.hasNext()) {
                reader.skipValue();
              }
              reader.endObject();
              return;
            }
          }
        }//TESTED
        else if (meta.isJsonArray()) {
          for (JsonElement meta2: meta.getAsJsonArray()) {
            if (textOnly || checkIfMandatoryFieldsExist(meta2)) {
              DocumentPojo doc = new DocumentPojo();
              if ((null != primaryKey) && (null != sourceName)) {
                String pkValue = getPrimaryKey(meta2);
                if (null != pkValue) {
                  doc.setUrl(sourceName + pkValue);
                }
              }
              if (textOnly) {
                doc.setFullText(meta2.toString());
              }
              else {
                doc.addToMetadata("json", convertJsonObjectToLinkedHashMap(meta2.getAsJsonObject(), _memUsage));
              }
              docList.add(doc);           
              if (++nCurrDocs >= nMaxDocs) {
                while (reader.hasNext()) {
                  reader.skipValue();
                  // ... (snippet truncated)

           
            long count = DbManager.getCollection("ingest", requestId.toString()).count();
            if (count > 0) {
              DBCursor dbc = DbManager.getCollection("ingest", requestId.toString()).find().limit(context.getStandaloneMaxDocs());
              for (Object o: dbc) {
                DocumentPojo doc = new DocumentPojo();
                doc.addToMetadata("logstash_record", o);
                toAdd.add(doc);
              }
              error = logStashDbo.getString("error", "no info");
              context.getHarvestStatus().update(source,new Date(),HarvestEnum.success, "Logstash service info: " + error, false, false);         
              break;             
            // ... (snippet truncated)

    BasicDBObject topDoc = null;
    if ((null != docs) && !docs.isEmpty()) {
      topDoc = docs.iterator().next();
      System.out.println("**TOP DOC=" + topDoc);
    }
    DocumentPojo newDoc = new DocumentPojo();
    newDoc.setTitle("test1 title: " + queryId);
    newDoc.setUrl("http://www.bbc.com");
    newDoc.setDescription("test1 desc");
    newDoc.setCreated(new Date());
    newDoc.setModified(new Date());
    newDoc.setPublishedDate(new Date());
    newDoc.setId(queryId);
    newDoc.setMediaType("Social");
    newDoc.addToMetadata("query", _query);
    newDoc.setSourceKey("test1");
    newDoc.setCommunityId(new ObjectId(_savedCommIdStrs[0]));
    if (null != topDoc) {
      newDoc.setScore(topDoc.getDouble(DocumentPojo.score_, 100.0));
      newDoc.setAggregateSignif(topDoc.getDouble(DocumentPojo.aggregateSignif_, 100.0));
    }
    else {
      newDoc.setScore(100.0);
      newDoc.setAggregateSignif(100.0);
    }
    if (null != docs) {
      docs.add(0, (BasicDBObject) newDoc.toDb());
    }
    response.getStats().found++;
  }

   */
  public List<DocumentPojo> parseDocument(XMLStreamReader reader) throws XMLStreamException {
    return parseDocument(reader, false);   
  }//TESTED (used by FileHarvester in this form, UAH::meta (stream) with textOnly==true below)
  public List<DocumentPojo> parseDocument(XMLStreamReader reader, boolean textOnly) throws XMLStreamException {
    DocumentPojo doc = new DocumentPojo();
    List<DocumentPojo> docList = new ArrayList<DocumentPojo>();
    boolean justIgnored = false;
    boolean hitIdentifier = false;
    nCurrDocs = 0;
    _memUsage = 0;

    StringBuffer fullText = new StringBuffer();
   
    while (reader.hasNext()) {
      int eventCode = reader.next();

      switch (eventCode)
      {
      case(XMLStreamReader.START_ELEMENT):
      {
        String tagName = reader.getLocalName();
     
        if (null == levelOneFields || levelOneFields.size() == 0) {
          levelOneFields = new ArrayList<String>();
          levelOneFields.add(tagName);
          doc = new DocumentPojo();
          sb.delete(0, sb.length());
          fullText.setLength(0);
          justIgnored = false;
        }
        else if (levelOneFields.contains(tagName)){
          sb.delete(0, sb.length());
          doc = new DocumentPojo();
          fullText.setLength(0);
          justIgnored = false;
        }
        else if ((null != ignoreFields) && ignoreFields.contains(tagName))
        {
          justIgnored = true;
        }
        else{
          if (this.bPreserveCase) {
            sb.append("<").append(tagName).append(">");         
          }
          else {
            sb.append("<").append(tagName.toLowerCase()).append(">");
          }
          justIgnored = false;
        }
        if (null != doc) {
          fullText.append("<").append(tagName);
          for (int ii = 0; ii < reader.getAttributeCount(); ++ii) {
            fullText.append(" ");
            fullText.append(reader.getAttributeLocalName(ii)).append("=\"").append(reader.getAttributeValue(ii)).append('"');
          }
          fullText.append(">");
        }//TESTED
       
        hitIdentifier = tagName.equalsIgnoreCase(PKElement);
       
        if (!justIgnored && (null != this.AttributePrefix)) { // otherwise ignore attributes anyway
          int nAttributes = reader.getAttributeCount();
          StringBuffer sb2 = new StringBuffer();
          for (int i = 0; i < nAttributes; ++i) {
            sb2.setLength(0);
            sb.append('<');
           
            sb2.append(this.AttributePrefix);
            if (this.bPreserveCase) {
              sb2.append(reader.getAttributeLocalName(i));
            }
            else {
              sb2.append(reader.getAttributeLocalName(i).toLowerCase());
            }
            sb2.append('>');
           
            sb.append(sb2);
            sb.append("<![CDATA[").append(reader.getAttributeValue(i).trim()).append("]]>");
            sb.append("</").append(sb2);
          }
        }
      }
      break;
     
      case (XMLStreamReader.CHARACTERS):
      {
        if (null != doc) {
          fullText.append(reader.getText());
        }//TESTED
       
        if ((reader.getText().trim().length() > 0) && !justIgnored) {
          sb.append("<![CDATA[").append(reader.getText().trim()).append("]]>");
        }
        if(hitIdentifier)
        {
          String tValue = reader.getText().trim();
          if (null != XmlSourceName){
            if (tValue.length()> 0){
              doc.setUrl(XmlSourceName + tValue);
            }
          }
        }
      }
      break;
      case (XMLStreamReader.END_ELEMENT):
      {
        if (null != doc) {
          fullText.append("</").append(reader.getLocalName()).append(">");
        }//TESTED
       
        if (reader.getLocalName().equalsIgnoreCase(PKElement)) {
          hitIdentifier = false; // (stop treating character data as the primary key once its element closes)
        }
        if ((null == ignoreFields) || !ignoreFields.contains(reader.getLocalName())) {
          if (levelOneFields.contains(reader.getLocalName())) {
            JSONObject json;
            if (!textOnly) {
              try {
                json = XML.toJSONObject(sb.toString());
                for (String names: JSONObject.getNames(json))
                {
                  JSONObject rec = null;
                  JSONArray jarray = null;
   
                  try {
                    jarray = json.getJSONArray(names);
                    doc.addToMetadata(names, handleJsonArray(jarray, false));
                  } catch (JSONException e) {
                    try {
                      rec = json.getJSONObject(names);
                      doc.addToMetadata(names, convertJsonObjectToLinkedHashMap(rec));
                    } catch (JSONException e2) {
                      try {
                        Object[] val = {json.getString(names)};
                        doc.addToMetadata(names,val);
                      } catch (JSONException e1) {
                        e1.printStackTrace();
                      }
                    }
                  }
                }
   
              } catch (JSONException e) {
                e.printStackTrace();
              }
            }
            doc.setFullText(fullText.toString());
            _memUsage += sb.length()*4L; // 4x: 1x for the full text copy, 3x for the object + overhead
            sb.setLength(0);
           
            docList.add(doc);
            // ... (snippet truncated)
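
The whole parser above is a single pass over StAX events. A minimal self-contained sketch of the same START_ELEMENT / CHARACTERS / END_ELEMENT dispatch, using only the standard javax.xml.stream API:

  import javax.xml.stream.XMLInputFactory;
  import javax.xml.stream.XMLStreamException;
  import javax.xml.stream.XMLStreamReader;
  import java.io.StringReader;

  static void walkXml(String xml) throws XMLStreamException {
    XMLStreamReader reader =
        XMLInputFactory.newInstance().createXMLStreamReader(new StringReader(xml));
    while (reader.hasNext()) {
      switch (reader.next()) {
      case XMLStreamReader.START_ELEMENT:
        System.out.println("start: " + reader.getLocalName()); // e.g. "doc", "title"
        break;
      case XMLStreamReader.CHARACTERS:
        System.out.println("text:  " + reader.getText().trim());
        break;
      case XMLStreamReader.END_ELEMENT:
        System.out.println("end:   " + reader.getLocalName());
        break;
      }
    }
    reader.close();
  }
  // walkXml("<doc><title>t1</title></doc>") prints start/doc, start/title, text/t1, end/title, end/doc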
