// Package org.apache.manifoldcf.agents.output.gts
//
// Source Code of org.apache.manifoldcf.agents.output.gts.HttpPoster

/* $Id: HttpPoster.java 988245 2010-08-23 18:39:35Z kwright $ */

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.manifoldcf.agents.output.gts;

import org.apache.manifoldcf.core.interfaces.*;
import org.apache.manifoldcf.core.common.Base64;
import org.apache.manifoldcf.agents.interfaces.*;
import org.apache.manifoldcf.agents.system.*;

import java.io.*;
import java.net.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import javax.net.*;
import javax.net.ssl.*;

import org.apache.log4j.*;

/**
* Posts an input stream to the GTS
*
*/
public class HttpPoster
{
  /** Revision-control identification string for this source file. */
  public static final String _rcsid = "@(#)$Id: HttpPoster.java 988245 2010-08-23 18:39:35Z kwright $";

  // Names of configuration properties.  The constructor reads the buffer size, response retry count,
  // response retry interval and reschedule interval properties through LockManagerFactory.getIntProperty();
  // the remaining names are not referenced in the part of this class reviewed here.
  // NOTE(review): these fields are public and non-final, so other code can reassign them - confirm whether
  // that is intended before tightening them.

  /** Ingestion buffer size property (size in bytes of the document copy buffer; constructor default 32768). */
  public static String ingestBufferSizeProperty = "org.apache.manifoldcf.ingest.buffersize";
  /** Credentials realm property name. */
  public static String ingestCredentialsRealm = "org.apache.manifoldcf.ingest.credentialrealm";
  /** Response retry count property (constructor default 9000). */
  public static String ingestResponseRetryCount = "org.apache.manifoldcf.ingest.responseretrycount";
  /** Response retry interval property, in milliseconds (constructor default 20). */
  public static String ingestResponseRetryInterval = "org.apache.manifoldcf.ingest.retryinterval";
  /** Reschedule interval property, in milliseconds (constructor default 60000). */
  public static String ingestRescheduleInterval = "org.apache.manifoldcf.ingest.rescheduleinterval";
  /** Ingestion URI property name. */
  public static String ingestURIProperty = "org.apache.manifoldcf.ingest.uri";
  /** Ingestion user property name. */
  public static String ingestUserProperty = "org.apache.manifoldcf.ingest.user";
  /** Ingestion password property name. */
  public static String ingestPasswordProperty = "org.apache.manifoldcf.ingest.password";
  /** Maximum connections property name. */
  public static String ingestMaxConnectionsProperty = "org.apache.manifoldcf.ingest.maxconnections";

  // Chunk size for base64-encoded headers.  The ingestion thread splits the base64-encoded ACL and
  // document template into header lines carrying at most this many characters each.
  protected final static int HEADER_CHUNK = 4096;

  // Base64 of "user:password"; stays null when no usable credentials were given to the constructor.
  private String encodedCredentials = null;
  // Realm sent along with the credentials; only set when credentials are set.
  private String realm = null;
  // The URI string this poster was constructed with.
  private String postURI = null;
  // Parsed form of postURI.
  private URL url = null;
  // postURI with "?DELETE" appended.
  private URL deleteURL = null;
  // postURI with "?STATUS" appended.
  private URL infoURL = null;
  // Host, port and protocol extracted from url; the port falls back to 443 for https and 80 otherwise.
  private String host = null;
  private int port = 80;
  private String protocol = null;

  /** Default buffer size */
  private final int buffersize;
  /** Size coefficient: extra response retries granted per byte of document length. */
  private static double sizeCoefficient = 0.0005;    // one extra retry per 2000 bytes, i.e. 20 ms more timeout at the default 20 ms retry wait; chosen empirically
  /** the number of times we should poll for the response */
  // In the code reviewed here this is the base retry count that, multiplied by responseRetryWait,
  // yields the socket read timeout.
  private final int responseRetries;
  /** how long we should wait before checking for a new stream */
  // Milliseconds per retry; see createSocket().
  private final long responseRetryWait;
  /** How long to wait before retrying a failed ingestion */
  // Milliseconds added to the current time to form the retry time of thrown ServiceInterruptions.
  private final long interruptionRetryTime;

  /** This is the secure socket factory we will use.  I'm presuming it's thread-safe, but
  * if not, synchronization blocks are in order when it's used.
  * It is built once, at class-initialization time, by getSecureSocketFactory().
  * NOTE(review): createSocket() as written obtains its factory from SSLSocketFactory.getDefault()
  * rather than from this field - confirm which of the two is intended. */
  protected static javax.net.ssl.SSLSocketFactory secureSocketFactory = null;
  static
  {
    try
    {
      secureSocketFactory = getSecureSocketFactory();
    }
    catch (ManifoldCFException e)
    {
      // If we can't create, print and fail
      // Note that this terminates the whole JVM (exit status 100) while the class is being loaded.
      e.printStackTrace();
      System.exit(100);
    }
  }

  /**
  * Initialized the http poster.
  * @param userID is the unencoded user name, or null.
  * @param password is the unencoded password, or null.
  * @param postURI the uri to post the request to
  */
  public HttpPoster(IThreadContext threadContext, String realm, String userID, String password, String postURI)
    throws ManifoldCFException
  {
    if (userID != null && userID.length() > 0 && password != null)
    {
      this.encodedCredentials = new org.apache.manifoldcf.core.common.Base64().encodeByteArray((userID+":"+password).getBytes(StandardCharsets.UTF_8));
      this.realm = realm;
    }
    this.postURI = postURI;

    // Create a URL to GTS
    try
    {
      url = new URL(postURI);
      deleteURL = new URL(postURI+"?DELETE");
      infoURL = new URL(postURI+"?STATUS");
    }
    catch (MalformedURLException murl)
    {
      throw new ManifoldCFException("Bad url",murl);
    }

    // set the port
    port = url.getPort();
    host = url.getHost();
    protocol = url.getProtocol();
    if (port == -1)
    {
      if (protocol.equalsIgnoreCase("https"))
        port = 443;
      else
        port = 80;
    }

    buffersize = LockManagerFactory.getIntProperty(threadContext,ingestBufferSizeProperty,32768);
    responseRetries = LockManagerFactory.getIntProperty(threadContext,ingestResponseRetryCount,9000);
    responseRetryWait = LockManagerFactory.getIntProperty(threadContext,ingestResponseRetryInterval,20);
    interruptionRetryTime = LockManagerFactory.getIntProperty(threadContext,ingestRescheduleInterval,60000);
  }

  /**
  * Post the input stream to ingest
  * @param documentURI is the document's uri.
  * @param document is the document structure to ingest.
  * @return true if the ingestion was successful, or false if the ingestion is illegal.
  * @throws ManifoldCFException, ServiceInterruption
  */
  public boolean indexPost(String documentURI,
    String[] collections, String documentTemplate, String authorityNameString,
    RepositoryDocument document, IOutputAddActivity activities)
    throws ManifoldCFException, ServiceInterruption
  {
    StringBuilder aclXml = new StringBuilder();
    Iterator<String> securityTypeIterator = document.securityTypesIterator();
    String[] shareAcls = null;
    String[] shareDenyAcls = null;
    String[] documentAcls = null;
    String[] documentDenyAcls = null;
    String[] parentAcls = null;
    String[] parentDenyAcls = null;
    while (securityTypeIterator.hasNext())
    {
      String securityType = securityTypeIterator.next();
      if (securityType.equals(RepositoryDocument.SECURITY_TYPE_SHARE))
      {
        shareAcls = document.getSecurityACL(securityType);
        shareDenyAcls = document.getSecurityDenyACL(securityType);
      }
      else if (securityType.equals(RepositoryDocument.SECURITY_TYPE_DOCUMENT))
      {
        documentAcls = document.getSecurityACL(securityType);
        documentDenyAcls = document.getSecurityDenyACL(securityType);
      }
      else if (securityType.equals(RepositoryDocument.SECURITY_TYPE_PARENT))
      {
        parentAcls = document.getSecurityACL(securityType);
        parentDenyAcls = document.getSecurityDenyACL(securityType);
      }
      else
        // Can't accept the document, because we don't know how to secure it
        return false;
    }

    writeACLs(aclXml,"share",shareAcls,shareDenyAcls,authorityNameString,activities);
    writeACLs(aclXml,"directory",parentAcls,parentDenyAcls,authorityNameString,activities);
    writeACLs(aclXml,"file",documentAcls,documentDenyAcls,authorityNameString,activities);

    if (aclXml.length() > 0)
      aclXml.append("</document-acl>");
    String aclXmlString = aclXml.toString();

    if (Logging.ingest.isDebugEnabled())
      Logging.ingest.debug("indexPost(): '" + documentURI + "'");

    // This flag keeps track of whether we read anything from the input stream yet.
    // If not, we can retry here.  If so, we have to reschedule.
    boolean readFromDocumentStreamYet = false;
    int ioErrorRetry = 3;

    while (true)
    {
      try
      {
        IngestThread t = new IngestThread(documentURI,aclXmlString,collections,documentTemplate,document);
        try
        {
          t.start();
          t.join();

          // Log the activity, if any, regardless of any exception
          if (t.getActivityCode() != null)
            activities.recordActivity(t.getActivityStart(),GTSConnector.INGEST_ACTIVITY,t.getActivityBytes(),documentURI,t.getActivityCode(),t.getActivityDetails());

          readFromDocumentStreamYet = (readFromDocumentStreamYet || t.getReadFromDocumentStreamYet());

          Throwable thr = t.getException();
          if (thr != null)
          {
            if (thr instanceof ServiceInterruption)
              throw (ServiceInterruption)thr;
            if (thr instanceof ManifoldCFException)
              throw (ManifoldCFException)thr;
            if (thr instanceof IOException)
              throw (IOException)thr;
            if (thr instanceof RuntimeException)
              throw (RuntimeException)thr;
            else
              throw (Error)thr;
          }
          return t.getRval();
        }
        catch (InterruptedException e)
        {
          t.interrupt();
          throw new ManifoldCFException("Interrupted: "+e.getMessage(),ManifoldCFException.INTERRUPTED);
        }
      }
      catch (java.net.SocketTimeoutException ioe)
      {
        if (readFromDocumentStreamYet || ioErrorRetry == 0)
        {
          // If this continues, we should indeed abort the job.  Retries should not go on indefinitely either; 2 hours is plenty
          long currentTime = System.currentTimeMillis();
          throw new ServiceInterruption("IO error connecting to ingestion API: "+ioe.getMessage()+"; ingestion will be retried again later",
            ioe,
            currentTime + interruptionRetryTime,
            currentTime + 2L * 60L * 60000L,
            -1,
            true);
        }
      }
      catch (IOException ioe)
      {
        if (readFromDocumentStreamYet || ioErrorRetry == 0)
        {
          // If this continues, we should indeed abort the job.  Retries should not go on indefinitely either; 2 hours is plenty
          long currentTime = System.currentTimeMillis();
          throw new ServiceInterruption("IO error ingesting document: "+ioe.getMessage()+"; ingestion will be retried again later",
            ioe,
            currentTime + interruptionRetryTime,
            currentTime + 2L * 60L * 60000L,
            -1,
            true);
        }
      }

      // Sleep for a time, and retry
      try
      {
        ManifoldCF.sleep(10000L);
      }
      catch (InterruptedException e)
      {
        throw new ManifoldCFException("Interrupted: "+e.getMessage(),ManifoldCFException.INTERRUPTED);
      }
      ioErrorRetry--;

      // Go back around again!

    }

  }

  /** Write acls into a StringBuilder */
  protected static void writeACLs(StringBuilder aclXml, String type, String[] acl, String[] denyAcl, String authorityNameString, IOutputAddActivity activities)
    throws ManifoldCFException
  {
    if (acl != null && acl.length > 0 || denyAcl != null && denyAcl.length > 0)
    {
      if (aclXml.length() == 0)
        aclXml.append("<document-acl>");
      aclXml.append("<acl scope=\"").append(type).append("\">");
      if (acl != null)
      {
        for (int i=0; i < acl.length; i++)
        {
          if (Logging.ingest.isDebugEnabled())
            Logging.ingest.debug("Adding "+type+" ACL: " + acl[i]);
          aclXml.append("<allow>");
          aclXml.append(activities.qualifyAccessToken(authorityNameString,acl[i]));
          aclXml.append("</allow>");
        }
      }
      if (denyAcl != null)
      {
        for (int i=0; i < denyAcl.length; i++)
        {
          if (Logging.ingest.isDebugEnabled())
            Logging.ingest.debug("Adding "+type+" deny ACL: " + denyAcl[i]);
          aclXml.append("<deny>");
          aclXml.append(activities.qualifyAccessToken(authorityNameString,denyAcl[i]));
          aclXml.append("</deny>");
        }
      }
      aclXml.append("</acl>");
    }
  }

  /** Post a check request.
  */
  public void checkPost()
    throws ManifoldCFException, ServiceInterruption
  {
    if (Logging.ingest.isDebugEnabled())
      Logging.ingest.debug("checkPost()");

    int ioErrorRetry = 5;
    while (true)
    {
      // Open a socket to ingest, and to the response stream to get the post result
      try
      {
        StatusThread t = new StatusThread();
        try
        {
          t.start();
          t.join();

          Throwable thr = t.getException();
          if (thr != null)
          {
            if (thr instanceof ServiceInterruption)
              throw (ServiceInterruption)thr;
            if (thr instanceof ManifoldCFException)
              throw (ManifoldCFException)thr;
            if (thr instanceof IOException)
              throw (IOException)thr;
            if (thr instanceof RuntimeException)
              throw (RuntimeException)thr;
            else
              throw (Error)thr;
          }
          return;
        }
        catch (InterruptedException e)
        {
          t.interrupt();
          throw new ManifoldCFException("Interrupted: "+e.getMessage(),ManifoldCFException.INTERRUPTED);
        }
      }
      catch (IOException ioe)
      {
        if (ioErrorRetry == 0)
        {
          long currentTime = System.currentTimeMillis();
          throw new ServiceInterruption("IO exception checking: "+ioe.getMessage(),
            ioe,
            currentTime + interruptionRetryTime,
            currentTime + 2L * 60L * 60000L,
            -1,
            true);
        }
      }

      // Go back around again!
      // Sleep for a time, and retry
      try
      {
        ManifoldCF.sleep(10000L);
      }
      catch (InterruptedException e)
      {
        throw new ManifoldCFException("Interrupted",ManifoldCFException.INTERRUPTED);
      }
      ioErrorRetry--;

    }

  }

  /** Post a delete request.
  *@param documentURI is the document's URI.
  */
  public void deletePost(String documentURI, IOutputRemoveActivity activities)
    throws ManifoldCFException, ServiceInterruption
  {
    if (Logging.ingest.isDebugEnabled())
      Logging.ingest.debug("deletePost(): '" + documentURI + "'");

    int ioErrorRetry = 5;
    while (true)
    {
      try
      {
        DeleteThread t = new DeleteThread(documentURI);
        try
        {
          t.start();
          t.join();

          // Log the activity, if any, regardless of any exception
          if (t.getActivityCode() != null)
            activities.recordActivity(t.getActivityStart(),GTSConnector.REMOVE_ACTIVITY,null,documentURI,t.getActivityCode(),t.getActivityDetails());

          Throwable thr = t.getException();
          if (thr != null)
          {
            if (thr instanceof ServiceInterruption)
              throw (ServiceInterruption)thr;
            if (thr instanceof ManifoldCFException)
              throw (ManifoldCFException)thr;
            if (thr instanceof IOException)
              throw (IOException)thr;
            if (thr instanceof RuntimeException)
              throw (RuntimeException)thr;
            else
              throw (Error)thr;
          }
          return;
        }
        catch (InterruptedException e)
        {
          t.interrupt();
          throw new ManifoldCFException("Interrupted: "+e.getMessage(),ManifoldCFException.INTERRUPTED);
        }
      }
      catch (IOException ioe)
      {
        if (ioErrorRetry == 0)
        {
          long currentTime = System.currentTimeMillis();
          throw new ServiceInterruption("IO exception deleting: "+ioe.getMessage()+"; deletion will be retried again later",
            ioe,
            currentTime + interruptionRetryTime,
            currentTime + 2L * 60L * 60000L,
            -1,
            true);
        }
        // Fall through and recycle
      }

      // Go back around again!
      // Sleep for a time, and retry
      try
      {
        ManifoldCF.sleep(10000L);
      }
      catch (InterruptedException e)
      {
        throw new ManifoldCFException("Interrupted",ManifoldCFException.INTERRUPTED);
      }

      ioErrorRetry--;

    }

  }

  /**
  * Get the response code of the post
  * @param stream the stream the response is going to come from
  * @return the response string
  * @throws ManifoldCFException
  */
  protected String getResponse(BufferedReader stream) throws ManifoldCFException, ServiceInterruption
  {
    Logging.ingest.debug("Waiting for response stream");
    StringBuilder res = new StringBuilder();
    try
    {
      // Stream.ready() always returns false for secure sockets :-(.  So
      // we have to rely on socket timeouts to interrupt us if the server goes down.

      while (true)
      {
        int i = stream.read();
        if (i == -1)
          break;
        res.append((char) i);
      }
      Logging.ingest.debug("Read of response stream complete");
    }
    catch (java.net.SocketTimeoutException e)
    {
      // If this continues, we should indeed abort the job.  Retries should not go on indefinitely either; 2 hours is plenty
      long currentTime = System.currentTimeMillis();
      throw new ServiceInterruption("Ingestion API socket timeout exception waiting for response code: "+e.getMessage()+"; ingestion will be retried again later",
        e,
        currentTime + interruptionRetryTime,
        currentTime + 2L * 60L * 60000L,
        -1,
        true);
    }
    catch (InterruptedIOException e)
    {
      throw new ManifoldCFException("Interrupted",ManifoldCFException.INTERRUPTED);
    }
    catch (java.net.ConnectException e)
    {
      // If this continues, we should indeed abort the job.  Retries should not go on indefinitely either; 2 hours is plenty
      long currentTime = System.currentTimeMillis();
      throw new ServiceInterruption("Timed out connecting to ingestion API: "+e.getMessage()+"; ingestion will be retried again later",
        e,
        currentTime + interruptionRetryTime,
        currentTime + 2L * 60L * 60000L,
        -1,
        true);
    }
    catch (java.net.SocketException e)
    {
      // Return 400 error; likely a connection reset which lost us the response data, so
      // just treat it as something OK.
      return "HTTP/1.0 400 Connection Reset";

    }
    catch (IOException ioe)
    {
      Logging.ingest.warn("IO exception trying to get response from ingestion API: "+ioe.getMessage(),ioe);
      // If this continues, we should indeed abort the job.  Retries should not go on indefinitely either; 2 hours is plenty
      long currentTime = System.currentTimeMillis();
      throw new ServiceInterruption("IO exception waiting for response code: "+ioe.getMessage()+"; ingestion will be retried again later",
        ioe,
        currentTime + interruptionRetryTime,
        currentTime + 2L * 60L * 60000L,
        -1,
        true);
    }

    return res.toString();
  }

  /** Write credentials to output */
  protected void writeCredentials(OutputStream out)
    throws IOException
  {
    // Apply credentials if present
    if (encodedCredentials != null)
    {
      Logging.ingest.debug("Applying credentials");
      byte[] tmp = ("Authorization: Basic " + encodedCredentials + "\r\n").getBytes(StandardCharsets.UTF_8);
      out.write(tmp, 0, tmp.length);

      tmp = ("WWW-Authenticate: Basic realm=\"" + ((realm != null) ? realm : "") + "\"\r\n").getBytes(StandardCharsets.UTF_8);
      out.write(tmp, 0, tmp.length);
    }
  }

  /** Encode for metadata.
  *@param inputString is the input string.
  *@return output, encoded.
  */
  protected static String metadataEncode(String inputString)
  {
    StringBuilder rval = new StringBuilder();
    int i = 0;
    while (i < inputString.length())
    {
      char x = inputString.charAt(i++);
      // Certain characters must simply be skipped, because they are illegal in header fields.
      if (x >= ' ' && x <= (char)127)
      {
        if (x == '\\' || x == ',')
          rval.append('\\');
        rval.append(x);
      }
    }
    return rval.toString();
  }

  /** Build a secure socket factory based on no keystore and a lax trust manager.
  * This allows use of SSL for privacy but not identification. */
  protected static javax.net.ssl.SSLSocketFactory getSecureSocketFactory()
    throws ManifoldCFException
  {
    try
    {
      java.security.SecureRandom secureRandom = java.security.SecureRandom.getInstance("SHA1PRNG");

      // Create an SSL context
      javax.net.ssl.SSLContext sslContext = javax.net.ssl.SSLContext.getInstance("SSL");
      sslContext.init(null,new LaxTrustManager[]{new LaxTrustManager()},secureRandom);
      return sslContext.getSocketFactory();
    }
    catch (java.security.NoSuchAlgorithmException e)
    {
      throw new ManifoldCFException("No such algorithm",e);
    }
    catch (java.security.KeyManagementException e)
    {
      throw new ManifoldCFException("Key management exception",e);
    }
  }

  /** Create a socket in a manner consistent with all of our specified parameters.
  */
  protected Socket createSocket(long responseRetryCount)
    throws IOException, ManifoldCFException
  {
    Socket socket;
    if (protocol.equals("https"))
    {
      try
      {
        SocketFactory factory = SSLSocketFactory.getDefault();
        socket = factory.createSocket(host,port);
      }
      catch (InterruptedIOException e)
      {
        throw e;
      }
      catch (IOException e)
      {
        throw new ManifoldCFException("Couldn't set up SSL connection to ingestion API: "+e.getMessage(),e);
      }
    }
    else
      socket = new Socket(host, port);

    // Calculate the timeout we want
    long timeoutMilliseconds = responseRetryWait * responseRetryCount;
    socket.setSoTimeout((int)timeoutMilliseconds);

    return socket;
  }

  /** Our own trust manager, which ignores certificate issues */
  protected static class LaxTrustManager implements X509TrustManager
  {
    /** Does nothing */
    public LaxTrustManager()
    {
    }

    /** Return a list of accepted issuers.  There are none. */
    public java.security.cert.X509Certificate[] getAcceptedIssuers()
    {
      return new java.security.cert.X509Certificate[0];
    }

    /** We have no problem with any clients */
    public void checkClientTrusted(java.security.cert.X509Certificate[] certs, String authType)
      throws java.security.cert.CertificateException
    {
    }

    /** We have no problem with any servers */
    public void checkServerTrusted(java.security.cert.X509Certificate[] certs, String authType)
      throws java.security.cert.CertificateException
    {
    }

  }

  /** Killable thread that does ingestions.
  * Java 1.5 stopped permitting thread interruptions to abort socket waits.  As a result, it is impossible to get threads to shutdown cleanly that are doing
  * such waits.  So, the places where this happens are segregated in their own threads so that they can be just abandoned.
  *
  * This thread does a single document ingestion.
  */
  protected class IngestThread extends java.lang.Thread
  {
    protected String documentURI;
    protected String aclXmlString;
    protected String[] collections;
    protected String documentTemplate;
    protected RepositoryDocument document;

    protected Long activityStart = null;
    protected Long activityBytes = null;
    protected String activityCode = null;
    protected String activityDetails = null;
    protected Throwable exception = null;
    protected boolean readFromDocumentStreamYet = false;
    protected boolean rval = false;

    public IngestThread(String documentURI, String aclXmlString, String[] collections, String documentTemplate, RepositoryDocument document)
    {
      super();
      setDaemon(true);
      this.documentURI = documentURI;
      this.aclXmlString = aclXmlString;
      this.collections = collections;
      this.documentTemplate = documentTemplate;
      this.document = document;
    }

    public void run()
    {
      long length = document.getBinaryLength();
      InputStream is = document.getBinaryStream();

      try
      {
        // Do the operation!
        long fullStartTime = System.currentTimeMillis();

        // Open a socket to ingest, and to the response stream to get the post result
        try
        {
          // Set up the socket, and the (optional) secure socket.
          long responseRetryCount = responseRetries + (long)((float)length * sizeCoefficient);
          Socket socket = createSocket(responseRetryCount);

          try
          {

            InputStreamReader isr = new InputStreamReader(socket.getInputStream(),"ASCII");
            try
            {
              BufferedReader in = new BufferedReader(isr);
              try
              {
                OutputStream out = socket.getOutputStream();
                try
                {
                  // Create the output stream to GTS
                  String uri = url.getFile();
                  if (uri.length() == 0)
                    uri = "/";
                  byte[] tmp = ("POST " + uri + " HTTP/1.0\r\n").getBytes(StandardCharsets.UTF_8);
                  out.write(tmp, 0, tmp.length);

                  // Set all the headers
                  tmp = ("Document-URI: " + documentURI + "\r\n").getBytes(StandardCharsets.UTF_8);
                  out.write(tmp, 0, tmp.length);

                  writeCredentials(out);

                  // Apply ACL if present
                  if (aclXmlString.length() > 0)
                  {

                    String encodedACL = new Base64().encodeByteArray(aclXmlString.getBytes(StandardCharsets.UTF_8));

                    // Break into chunks - 4K each - 'cause otherwise we blow up the ingester.
                    int index = 0;
                    while (true)
                    {
                      if (index + HEADER_CHUNK >= encodedACL.length())
                      {
                        tmp = ("Document-ACL: " + encodedACL.substring(index) + "\r\n").getBytes(StandardCharsets.UTF_8);
                        out.write(tmp, 0, tmp.length);
                        break;
                      }
                      tmp = ("Document-ACL: " + encodedACL.substring(index,index + HEADER_CHUNK) + "\r\n").getBytes(StandardCharsets.UTF_8);
                      out.write(tmp, 0, tmp.length);
                      index += HEADER_CHUNK;
                    }
                  }

                  // Do the collections
                  if (collections != null)
                  {
                    int index = 0;
                    while (index < collections.length)
                    {
                      String collectionName = collections[index++];
                      String encodedValue = metadataEncode(collectionName);
                      //System.out.println("collection metadata: collection_name = '"+encodedValue+"'");
                      tmp = ("Document-Metadata: collection_name="+encodedValue+"\r\n").getBytes(StandardCharsets.UTF_8);
                      out.write(tmp, 0, tmp.length);
                    }
                  }

                  // Do the document template
                  if (documentTemplate != null && documentTemplate.length() > 0)
                  {
                    String encodedTemplate = new Base64().encodeByteArray(documentTemplate.getBytes(StandardCharsets.UTF_8));
                    // Break into chunks - 4K each - 'cause otherwise we blow up the ingester.
                    int index = 0;
                    while (true)
                    {
                      if (index + HEADER_CHUNK >= encodedTemplate.length())
                      {
                        tmp = ("Document-Template: " + encodedTemplate.substring(index) + "\r\n").getBytes(StandardCharsets.UTF_8);
                        out.write(tmp, 0, tmp.length);
                        break;
                      }
                      tmp = ("Document-Template: " + encodedTemplate.substring(index,index + HEADER_CHUNK) + "\r\n").getBytes(StandardCharsets.UTF_8);
                      out.write(tmp, 0, tmp.length);
                      index += HEADER_CHUNK;
                    }
                  }

                  // Write all the metadata, if any
                  Iterator<String> iter = document.getFields();
                  while (iter.hasNext())
                  {
                    String fieldName = iter.next();
                    String[] values = document.getFieldAsStrings(fieldName);
                    // We only handle strings right now!!!
                    int k = 0;
                    while (k < values.length)
                    {
                      String value = (String)values[k++];

                      String encodedValue = metadataEncode(value);
                      //System.out.println("Metadata: Name = '"+fieldName+"', value = '"+encodedValue+"'");
                      tmp = ("Document-Metadata: "+ fieldName+"="+encodedValue+"\r\n").getBytes(StandardCharsets.UTF_8);
                      out.write(tmp, 0, tmp.length);
                    }
                  }

                  tmp = ("Content-length: " + new Long(length).toString() + "\r\n\n").getBytes(StandardCharsets.UTF_8);
                  out.write(tmp, 0, tmp.length);

                  long total = 0;
                  long now, later;
                  now = System.currentTimeMillis();

                  byte[] bytes = new byte[buffersize];

                  // Write out the contents of the inputstream to the socket
                  while (true)
                  {
                    int count;
                    // Specially catch all errors that come from reading the input stream itself.
                    // This will help us segregate errors that come from the stream vs. those that come from the ingestion system.
                    try
                    {
                      count = is.read(bytes);
                    }
                    catch (java.net.SocketTimeoutException ioe)
                    {
                      // We have to catch socket timeout exceptions specially, because they are derived from InterruptedIOException
                      // They are otherwise just like IOExceptions

                      // Log the error
                      Logging.ingest.warn("Error reading data for transmission to Ingestion API: "+ioe.getMessage(),ioe);

                      activityStart = new Long(fullStartTime);
                      activityCode = "-1";
                      activityDetails = "Couldn't read document: "+ioe.getMessage();

                      // If this continues, we should indeed abort the job.  Retries should not go on indefinitely either; 2 hours is plenty
                      long currentTime = System.currentTimeMillis();
                      throw new ServiceInterruption("IO error reading document for ingestion: "+ioe.getMessage()+"; read will be retried again later",
                        ioe,
                        currentTime + interruptionRetryTime,
                        currentTime + 2L * 60L * 60000L,
                        -1,
                        true);

                    }
                    catch (InterruptedIOException ioe)
                    {
                      // If the transfer was interrupted, it may be because we are shutting down the thread.

                      // Third-party library exceptions derived from InterruptedIOException are possible; if the stream comes from httpclient especially.
                      // If we see one of these, we treat it as "not an interruption".
                      if (!ioe.getClass().getName().equals("java.io.InterruptedIOException"))
                      {
                        // Log the error
                        Logging.ingest.warn("Error reading data for transmission to Ingestion API: "+ioe.getMessage(),ioe);

                        activityStart = new Long(fullStartTime);
                        activityCode = "-1";
                        activityDetails = "Couldn't read document: "+ioe.getMessage();

                        // If this continues, we should indeed abort the job.  Retries should not go on indefinitely either; 2 hours is plenty
                        long currentTime = System.currentTimeMillis();
                        throw new ServiceInterruption("IO error reading document for ingestion: "+ioe.getMessage()+"; read will be retried again later",
                          ioe,
                          currentTime + interruptionRetryTime,
                          currentTime + 2L * 60L * 60000L,
                          -1,
                          true);
                      }
                      else
                        throw ioe;
                    }
                    catch (IOException ioe)
                    {
                      // We need to decide whether to throw a service interruption or metacarta exception, based on what went wrong.
                      // We never retry here; the cause is the repository, so there's not any point.

                      // Log the error
                      Logging.ingest.warn("Error reading data for transmission to Ingestion API: "+ioe.getMessage(),ioe);

                      activityStart = new Long(fullStartTime);
                      activityCode = "-1";
                      activityDetails = "Couldn't read document: "+ioe.getMessage();

                      // If this continues, we should indeed abort the job.  Retries should not go on indefinitely either; 2 hours is plenty
                      long currentTime = System.currentTimeMillis();
                      throw new ServiceInterruption("IO error reading document for ingestion: "+ioe.getMessage()+"; read will be retried again later",
                        ioe,
                        currentTime + interruptionRetryTime,
                        currentTime + 2L * 60L * 60000L,
                        -1,
                        true);
                    }

                    if (count == -1)
                      break;
                    readFromDocumentStreamYet = true;
                    out.write(bytes,0,count);
                    total += (long)count;
                  }

                  later = System.currentTimeMillis();
                  if (Logging.ingest.isDebugEnabled())
                    Logging.ingest.debug("Total bytes posted: " + new Long(total).toString() + ", total time: " + (later - now));

                  out.flush();

                  // Now, process response
                  String res;
                  try
                  {
                    res = getResponse(in);
                  }
                  catch (ServiceInterruption si)
                  {
                    activityStart = new Long(now);
                    activityCode = "-2";
                    activityDetails = si.getMessage();
                    throw si;
                  }

                  if (Logging.ingest.isDebugEnabled())
                    Logging.ingest.debug("Response code from ingest: '" + res + "'");

                  CodeDetails cd = new CodeDetails(res);

                  activityStart = new Long(now);
                  activityBytes = new Long(length);
                  activityCode = cd.getCode();
                  activityDetails = cd.getDetails();

                  int codeValue = cd.getCodeValue();

                  // A negative number means http error of some kind.
                  if (codeValue < 0)
                    throw new ManifoldCFException("Http protocol error");

                  // 200 means everything went OK
                  if (codeValue == 200)
                  {
                    rval = true;
                    return;
                  }

                  // Anything else means the document didn't ingest.
                  // There are three possibilities here:
                  // 1) The document will NEVER ingest (it's illegal), in which case a 400 or 403 will be returned, and
                  // 2) There is a transient error, in which case we will want to try again, after a wait.
                  //    If the situation is (2), then we CAN'T retry if we already read any of the stream; therefore
                  //    we are forced to throw a "service interrupted" exception, and let the caller reschedule
                  //    the ingestion.
                  // 3) Something is wrong with the setup, e.g. bad credentials.  In this case we chuck a ManifoldCFException,
                  //    since this will abort the current activity entirely.

                  if (codeValue == 401)
                    throw new ManifoldCFException("Bad credentials for ingestion",ManifoldCFException.SETUP_ERROR);

                  if (codeValue >= 400 && codeValue < 500)
                  {
                    rval = false;
                    return;
                  }

                  // If this continues, we should indeed abort the job.  Retries should not go on indefinitely either; 2 hours is plenty
                  long currentTime = System.currentTimeMillis();
                  throw new ServiceInterruption("Error "+Integer.toString(codeValue)+" from ingestion request; ingestion will be retried again later",
                    new ManifoldCFException("Ingestion HTTP error code "+Integer.toString(codeValue)),
                    currentTime + interruptionRetryTime,
                    currentTime + 2L * 60L * 60000L,
                    -1,
                    true);
                }
                finally
                {
                  out.close();
                }
              }
              finally
              {
                in.close();
              }
            }
            finally
            {
              isr.close();
            }
          }
          finally
          {
            try
            {
              socket.close();
            }
            catch (InterruptedIOException e)
            {
              throw e;
            }
            catch (IOException e)
            {
              Logging.ingest.debug("Error closing socket: "+e.getMessage(),e);
              // Do NOT rethrow
            }
          }
        }
        catch (java.net.SocketTimeoutException ioe)
        {
          // These are just like IO errors, but since they are derived from InterruptedIOException, they have to be caught first.
          // Log the error
          Logging.ingest.warn("Error connecting to ingestion API: "+ioe.getMessage(),ioe);

          activityStart = new Long(fullStartTime);
          activityCode = "-1";
          activityDetails = ioe.getMessage();

          throw ioe;
        }
        catch (InterruptedIOException e)
        {
          return;
        }
        catch (IOException ioe)
        {
          activityStart = new Long(fullStartTime);

          // Intercept "broken pipe" exception, since that seems to be what we get if the ingestion API kills the socket right after a 400 goes out.
          // Basically, we have no choice but to interpret that in the same manner as a 400, since no matter how we do it, it's a race and the 'broken pipe'
          // result is always possible.  So we might as well expect it and treat it properly.
          //
          if (ioe.getClass().getName().equals("java.net.SocketException") && ioe.getMessage().toLowerCase().indexOf("broken pipe") != -1)
          {
            // We've seen what looks like the ingestion interface forcibly closing the socket.
            // We *choose* to interpret this just like a 400 response.  However, we log in the history using a different code,
            // since we really don't know what happened for sure.
            // Record the attempt

            activityCode = "-103";
            activityDetails = "Presuming an ingestion rejection: "+ioe.getMessage();
            rval = false;
            return;
          }

          // Record the attempt
          activityCode = "-1";
          activityDetails = ioe.getMessage();

          // Log the error
          Logging.ingest.warn("Error communicating with Ingestion API: "+ioe.getMessage(),ioe);

          throw ioe;
        }
      }
      catch (Throwable e)
      {
        this.exception = e;
      }
    }

    public Throwable getException()
    {
      return exception;
    }

    public Long getActivityStart()
    {
      return activityStart;
    }

    public Long getActivityBytes()
    {
      return activityBytes;
    }

    public String getActivityCode()
    {
      return activityCode;
    }

    public String getActivityDetails()
    {
      return activityDetails;
    }

    public boolean getReadFromDocumentStreamYet()
    {
      return readFromDocumentStreamYet;
    }

    public boolean getRval()
    {
      return rval;
    }
  }

  /** Killable thread that does deletions.
  * Java 1.5 stopped permitting thread interruptions to abort socket waits.  As a result, it is impossible to get threads to shutdown cleanly that are doing
  * such waits.  So, the places where this happens are segregated in their own threads so that they can be just abandoned.
  *
  * This thread does a single document deletion.
  */
  protected class DeleteThread extends java.lang.Thread
  {
    protected String documentURI;

    protected Long activityStart = null;
    protected String activityCode = null;
    protected String activityDetails = null;
    protected Throwable exception = null;

    public DeleteThread(String documentURI)
    {
      super();
      setDaemon(true);
      this.documentURI = documentURI;
    }

    public void run()
    {
      try
      {
        // Do the operation!
        long fullStartTime = System.currentTimeMillis();
        // Open a socket to ingest, and to the response stream to get the post result
        try
        {
          // Set up the socket, and the (optional) secure socket.
          Socket socket = createSocket(responseRetries);
          try
          {
            InputStreamReader isr = new InputStreamReader(socket.getInputStream(),"ASCII");
            try
            {
              BufferedReader in = new BufferedReader(isr);
              try
              {
                OutputStream out = socket.getOutputStream();
                try
                {
                  long startTime = System.currentTimeMillis();
                  // Create the output stream to GTS
                  byte[] tmp = ("POST " + deleteURL.getFile() + " HTTP/1.0\r\n").getBytes(StandardCharsets.UTF_8);
                  out.write(tmp, 0, tmp.length);

                  // Set all the headers
                  tmp = ("Document-URI: " + documentURI + "\r\n").getBytes(StandardCharsets.UTF_8);
                  out.write(tmp, 0, tmp.length);

                  writeCredentials(out);

                  tmp = ("Content-length: 0\r\n\n").getBytes(StandardCharsets.UTF_8);
                  out.write(tmp, 0, tmp.length);

                  if (Logging.ingest.isDebugEnabled())
                    Logging.ingest.debug("Delete posted");

                  out.flush();

                  String res;
                  try
                  {
                    res = getResponse(in);
                  }
                  catch (ServiceInterruption si)
                  {
                    activityStart = new Long(startTime);
                    activityCode = "-2";
                    activityDetails = si.getMessage();
                    throw si;
                  }

                  if (Logging.ingest.isDebugEnabled())
                    Logging.ingest.debug("Response code from delete: '" + res + "'");

                  CodeDetails cd = new CodeDetails(res);

                  activityStart = new Long(startTime);
                  activityCode = cd.getCode();
                  activityDetails = cd.getDetails();

                  int codeValue = cd.getCodeValue();

                  if (codeValue < 0)
                    throw new ManifoldCFException("Http protocol error");

                  // 200 means everything went OK
                  if (codeValue == 200)
                    return;

                  // We ignore everything in the range from 400-500 now
                  if (codeValue == 401)
                    throw new ManifoldCFException("Bad credentials for ingestion",ManifoldCFException.SETUP_ERROR);

                  if (codeValue >= 400 && codeValue < 500)
                    return;

                  // Anything else means the document didn't delete.  Throw the error.
                  throw new ManifoldCFException("Error deleting document: '"+res+"'");
                }
                finally
                {
                  out.close();
                }
              }
              finally
              {
                in.close();
              }
            }
            finally
            {
              isr.close();
            }
          }
          finally
          {
            try
            {
              socket.close();
            }
            catch (InterruptedIOException e)
            {
              throw e;
            }
            catch (IOException e)
            {
              Logging.ingest.debug("Error closing socket: "+e.getMessage(),e);
              // Do NOT rethrow
            }
          }
        }
        catch (InterruptedIOException ioe)
        {
          return;
        }
        catch (IOException ioe)
        {
          // Log the error
          Logging.ingest.warn("Error communicating with Ingestion API: "+ioe.getMessage(),ioe);

          activityStart = new Long(fullStartTime);
          activityCode = "-1";
          activityDetails = ioe.getMessage();

          throw ioe;
        }
      }
      catch (Throwable e)
      {
        this.exception = e;
      }
    }

    public Throwable getException()
    {
      return exception;
    }

    public Long getActivityStart()
    {
      return activityStart;
    }

    public String getActivityCode()
    {
      return activityCode;
    }

    public String getActivityDetails()
    {
      return activityDetails;
    }
  }

  /** Killable thread that does a status check.
  * Java 1.5 stopped permitting thread interruptions to abort socket waits.  As a result, it is impossible to get threads to shutdown cleanly that are doing
  * such waits.  So, the places where this happens are segregated in their own threads so that they can be just abandoned.
  *
  * This thread does a status check.
  */
  protected class StatusThread extends java.lang.Thread
  {
    protected Throwable exception = null;

    public StatusThread()
    {
      super();
      setDaemon(true);
    }

    public void run()
    {
      try
      {
        // Do the operation!
        // Open a socket to ingest, and to the response stream to get the post result
        try
        {
          // Set up the socket, and the (optional) secure socket.
          Socket socket = createSocket(responseRetries);
          try
          {
            InputStreamReader isr = new InputStreamReader(socket.getInputStream(),"ASCII");
            try
            {
              BufferedReader in = new BufferedReader(isr);
              try
              {
                OutputStream out = socket.getOutputStream();
                try
                {
                  // Create the output stream to GTS
                  byte[] tmp = ("GET " + infoURL.getFile() + " HTTP/1.0\r\n").getBytes(StandardCharsets.UTF_8);
                  out.write(tmp, 0, tmp.length);

                  writeCredentials(out);

                  tmp = ("Content-length: 0\r\n\n").getBytes(StandardCharsets.UTF_8);
                  out.write(tmp, 0, tmp.length);

                  if (Logging.ingest.isDebugEnabled())
                    Logging.ingest.debug("Status request posted");

                  out.flush();

                  String res = getResponse(in);

                  if (Logging.ingest.isDebugEnabled())
                    Logging.ingest.debug("Response code from delete: '" + res + "'");

                  CodeDetails cd = new CodeDetails(res);

                  int codeValue = cd.getCodeValue();
                  if (codeValue < 0)
                    throw new ManifoldCFException("Http protocol error");

                  // 200 means everything went OK
                  if (codeValue == 200)
                    return;

                  // We ignore everything in the range from 400-500 now
                  if (codeValue == 401)
                    throw new ManifoldCFException("Bad credentials for ingestion",ManifoldCFException.SETUP_ERROR);

                  // Anything else means the info request failed.
                  throw new ManifoldCFException("Error connecting to MetaCarta ingestion API: '"+res+"'");
                }
                finally
                {
                  out.close();
                }
              }
              finally
              {
                in.close();
              }
            }
            finally
            {
              isr.close();
            }
          }
          finally
          {
            try
            {
              socket.close();
            }
            catch (InterruptedIOException e)
            {
              throw e;
            }
            catch (IOException e)
            {
              Logging.ingest.debug("Error closing socket: "+e.getMessage(),e);
              // Do NOT rethrow
            }
          }
        }
        catch (InterruptedIOException ioe)
        {
          // Exit the thread.
          return;
        }
        catch (IOException ioe)
        {
          // Log the error
          Logging.ingest.warn("Error communicating with Ingestion API: "+ioe.getMessage(),ioe);
          throw ioe;
        }
      }
      catch (Throwable e)
      {
        this.exception = e;
      }
    }

    public Throwable getException()
    {
      return exception;
    }
  }

  /** Code+details paper object */
  protected static class CodeDetails
  {
    protected String code;
    protected int codeValue;
    protected String details;

    public CodeDetails(String res)
    {
      codeValue = -100;
      code = "-100";
      details = "Http response was improperly formed";

      int firstSpace = res.indexOf(" ");
      if (firstSpace != -1)
      {
        int secondSpace = res.indexOf(" ", firstSpace + 1);
        if (secondSpace != -1)
        {
          code = res.substring(firstSpace + 1, secondSpace);
          details = res.substring(secondSpace+1).trim();
          try
          {
            codeValue = (int)(new Double(code).doubleValue());
            if (codeValue == 200)
              details = null;
          }
          catch (NumberFormatException e)
          {
            // Fall through and leave codeValue unaltered
          }
        }
      }
    }

    public String getCode()
    {
      return code;
    }

    public int getCodeValue()
    {
      return codeValue;
    }

    public String getDetails()
    {
      return details;
    }

  }

}
TOP

Related Classes of org.apache.manifoldcf.agents.output.gts.HttpPoster

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.