Package org.docx4j.model.fields

Source Code of org.docx4j.model.fields.FieldsPreprocessor

package org.docx4j.model.fields;

import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import java.util.NoSuchElementException;

import javax.xml.bind.JAXBContext;
import javax.xml.bind.Unmarshaller;
import javax.xml.namespace.QName;
import javax.xml.transform.Source;
import javax.xml.transform.Templates;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.stream.StreamSource;

import org.docx4j.XmlUtils;
import org.docx4j.jaxb.Context;
import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.docx4j.openpackaging.parts.JaxbXmlPart;
import org.docx4j.wml.ContentAccessor;
import org.docx4j.wml.FldChar;
import org.docx4j.wml.P;
import org.docx4j.wml.ProofErr;
import org.docx4j.wml.R;
import org.docx4j.wml.STFldCharType;
import org.docx4j.wml.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* This class puts fields into a "canonical" representation
* (see FieldRef for description).
*
* It does this in 2 steps:
* - step 1: use XSLT to convert simple fields into complex ones
* - step 2: put all the instructions into a single run
*
* Currently the canonicalisation is done at the paragraph level,
* so it is not suitable for fields (such as TOC) which extend across paragraphs.
* TOC will need to be regenerated (using Word) if touched by canonicalisation.
*
* @author jharrop
*
*/
public class FieldsPreprocessor {
 
  private static Logger log = LoggerFactory.getLogger(FieldsPreprocessor.class);   

    private final static QName _RInstrText_QNAME = new QName("http://schemas.openxmlformats.org/wordprocessingml/2006/main",
        "instrText");
    private final static QName _PHyperlink_QNAME = new QName("http://schemas.openxmlformats.org/wordprocessingml/2006/main",
        "hyperlink");
   
 
  static Templates xslt;     
  static {
    try {
      Source xsltSource = new StreamSource(
            org.docx4j.utils.ResourceUtils.getResource(
                "org/docx4j/model/fields/FieldsSimpleToComplex.xslt"));
      xslt = XmlUtils.getTransformerTemplate(xsltSource);
    } catch (IOException e) {
      e.printStackTrace();
    } catch (TransformerConfigurationException e) {
      e.printStackTrace();
    }
   
  }
 
  private FieldsPreprocessor(List<FieldRef> fieldRefs) {
    this.fieldRefs = fieldRefs;
  }

  /**
   * Convert any w:fldSimple in this part to complex field.
   * @param part
   * @throws Docx4JException
   */
  public static void complexifyFields(JaxbXmlPart part) throws Docx4JException {
   
    org.w3c.dom.Document doc = XmlUtils.marshaltoW3CDomDocument(
        part.getJaxbElement() );  
   
//    XPathsPart xPathsPart = null;
       
    JAXBContext jc = Context.jc;
    try {
      // Use constructor which takes Unmarshaller, rather than JAXBContext,
      // so we can set JaxbValidationEventHandler
      Unmarshaller u = jc.createUnmarshaller();
      u.setEventHandler(new org.docx4j.jaxb.JaxbValidationEventHandler());
      javax.xml.bind.util.JAXBResult result = new javax.xml.bind.util.JAXBResult(u );
               
      org.docx4j.XmlUtils.transform(doc, xslt, null, result);
     
      part.setJaxbElement(result);
    } catch (Exception e) {
      throw new Docx4JException("Problems transforming fields", e);     
    }
       
  }
 
 
  public static P canonicalise(P p, List<FieldRef> fieldRefs) {
    /*
     * Result is something like:
     *
            <w:p>
                <w:r>
                    <w:fldChar w:fldCharType="begin"/>
                    <w:instrText xml:space="preserve"> DATE  </w:instrText>
                    <w:fldChar w:fldCharType="separate"/>
                </w:r>
                <w:r>
                    <w:t>4/12/2011</w:t>
                </w:r>
                <w:r>
                    <w:fldChar w:fldCharType="end"/>
                </w:r>
            </w:p>     
    
     * Note that the content between begin and separate could be more complex
     * including nested fields.
     **/
   
   
    FieldsPreprocessor fp = new FieldsPreprocessor(fieldRefs);
    return fp.canonicaliseInstance(p);
  }
 
  private P canonicaliseInstance(P p) {

    P newP = Context.getWmlObjectFactory().createP();
    newP.setPPr(p.getPPr());
   
    newR = Context.getWmlObjectFactory().createR();
//    fieldRPr = null;
   
    stack = new LinkedList<FieldRef>();
   
    log.debug(XmlUtils.marshaltoString(p));
    handleContent(p.getContent(), newP);

    // log.debug(XmlUtils.marshaltoString(newP, true));

    return newP;
  }
 
  /**
   * A list of FieldRef objects representing outermost fields
   * only.
   */
  private List<FieldRef> fieldRefs;
 
 
  private LinkedList<FieldRef> stack;
  private FieldRef currentField=null;
 
  private R newR;
 
  private void handleContent(List<Object> objects, ContentAccessor attachmentPoint) {
    // handles case where the run(s) containing the field are inside a P, or inside a P.Hyperlink
    // (eg a PAGEREF in a table of contents).
   
    for (Object o : objects ) {

      // Handling for hyperlink (can occur in field result, and might contain another
      // nested field).  Since at present the field processing here is for
      // MERGEFIELD and DOCPROPERTY fields, this is currently just handled by else below.
     
      //  if ( o instanceof P.Hyperlink
      //      || ((o instanceof JAXBElement
      //          && ((JAXBElement)o).getName().equals(_PHyperlink_QNAME)) )  ) {
      // 
     
     
      if ( o instanceof R ) {
       
        R existingRun = (R)o;
        handleRun(existingRun, attachmentPoint);

      } else if (o instanceof ProofErr) {
        // Ignore
        // What happens if we ignore eg grammarStart, but its matching
        // grammarEnd is outside and retained?
        // Well, a stray spellStart doesn't matter to Word 2010, so
        // assume others would be ok as well.
      } else {
        // its not something we're interested in
       
        log.debug("Retaining" + XmlUtils.unwrap(o).getClass().getName());

        attachmentPoint.getContent().add(o);

        // prepare new run
        newR = Context.getWmlObjectFactory().createR();
       
      }

//      if (newR.getContent().size() > 0 && !attachmentPoint.getContent().contains(newR)) {
//        attachmentPoint.getContent().add(newR);
//      }
     
    }
   
  }
 
  private boolean fieldIsTopLevel() {
    return stack.size()==1;
  }
 
  private boolean inParentResult() {
   
    FieldRef thisField = stack.pop();
    try {
      FieldRef parentField = stack.pop();
      boolean inResult = parentField.haveSeenSeparate();
      // restore stack
      stack.push(parentField);
      stack.push(thisField);
      return inResult;
    } catch (NoSuchElementException e) {
      // No parent
      // restore stack
      stack.push(thisField);
      return false;
    }
   
  }
 
  /**
   * Its preserved, if it is locked.
   *
   * If it isn't locked, it is preserved unless its a MERGEFIELD or a DOCPROPERTY field.
   *
   * @param fieldRef
   * @return
   */
  private boolean preserveResult(FieldRef fieldRef) {
   
    if (fieldRef.isLock()) return true;
   
   
    String fldName = fieldRef.getFldName();
    if (fldName==null) return true;
   
    if (fldName.equals("MERGEFIELD")
        || fldName.equals("DOCPROPERTY")) {
      return false;
    }
    return true;
  }

  private boolean preserveParentResult() {
   
    FieldRef thisField = stack.pop();
    FieldRef parentField = stack.pop();
    boolean preserveParentResult = preserveResult(parentField);
    // restore stack
    stack.push(parentField);
    stack.push(thisField);
    return preserveParentResult;
  }
 
  private void handleRun(R existingRun, ContentAccessor newAttachPoint) {
   
    // note that the newR object persists between invocations of this method,
    // so you have to be careful to actually add it to the docx
    // before re-creating it
   
    log.debug("\nInput run: \n " + XmlUtils.marshaltoString(existingRun, true, true));
   
    for (Object o2 : existingRun.getContent() ) {
     
      newR.setRPr(existingRun.getRPr());

      if (isCharType(o2, STFldCharType.BEGIN)) {
       
        log.debug("\n\n begin.. ");

        // Setup a FieldRef object
        currentField = new FieldRef((FldChar)XmlUtils.unwrap(o2));             
        currentField.setParent(newAttachPoint);             
        currentField.setBeginRun(newR); // may as well do this
       
        stack.push(currentField);
       
        if (inParentResult()) {
         
          if (preserveParentResult()) {
            newR.getContent().add(o2);
          } else {
            log.debug(".. but in result, so don't add to run");
          }
         
        } else {
         
          if ( fieldIsTopLevel() ) {
         
            newR = Context.getWmlObjectFactory().createR();
            newR.getContent().add(o2);         
           
            currentField.setBeginRun(newR); // IMPORTANT, so we can delete it when we perform mail merge
           
            fieldRefs.add(currentField);         
          } else {
            newR.getContent().add(o2);
           
            stack.peek().getInstructions().add(currentField);
          }
        }
       
      } else if (isCharType(o2, STFldCharType.SEPARATE)) {
       
        currentField.setSeenSeparate(true);
       
        if (inParentResult()) {

          if (preserveParentResult()) {
            newR.getContent().add(o2);
          } else {
            log.debug(".. but in result, so don't add to run");
          }

        } else {
       
          newR.getContent().add(o2);
          if (!newAttachPoint.getContent().contains(newR)) {
            newAttachPoint.getContent().add(newR);
            log.debug("-- attaching -->" + XmlUtils.marshaltoString(newR, true, true));
          }
         
          if ( fieldIsTopLevel() ) {
            // Top level field separator
           
            // Create result slot
            newR = Context.getWmlObjectFactory().createR();
            currentField.setResultsSlot(newR);
          }
        }
         
      } else if (isCharType(o2, STFldCharType.END)) {
       
        log.debug("\n\n .. end ");
       
        if (inParentResult()) {

          if (preserveParentResult()) {
           
            newR.getContent().add(o2);

            if (currentField.getFldName().equals("FORMTEXT")) {
              /*
               * Workaround for a bug in Word 2010.
               *
               * If you have multiple FORMTEXT in a single run,
               * for example:
               *
               *      <w:fldChar w:fldCharType="begin">
                        <w:ffData>
                          <w:name w:val="Text12"/>
                          <w:enabled/>
                          <w:calcOnExit w:val="false"/>
                          <w:textInput/>
                        </w:ffData>
                      </w:fldChar>
                      <w:instrText xml:space="preserve"> FORMTEXT </w:instrText>
                      <w:fldChar w:fldCharType="separate"/>
                      <w:t> </w:t>
                      <w:fldChar w:fldCharType="end"/>
                      <w:fldChar w:fldCharType="begin">
                        <w:ffData>
                          <w:name w:val="Text12"/>
                          <w:enabled/>
                          <w:calcOnExit w:val="false"/>
                          <w:textInput/>
                        </w:ffData>
                      </w:fldChar>
                      <w:instrText xml:space="preserve"> FORMTEXT </w:instrText>
                      <w:fldChar w:fldCharType="separate"/>
                      <w:t> </w:t>
                      <w:fldChar w:fldCharType="end"/>           
               *
               * Word 2010 does not display all the w:t elements (ie spaces appear to
               * be missing).
               *
               * Adding w:t/@xml:space="preserve" doesn't help.
               *
               * So the workaround here is to start a new run after each END tag.
               */
              if (!newAttachPoint.getContent().contains(newR)) {
                newAttachPoint.getContent().add(newR);
                log.debug("-- attaching -->" + XmlUtils.marshaltoString(newR, true, true));
              }
              newR = Context.getWmlObjectFactory().createR();           
            }           
          } else {
            log.debug(".. but in result, so don't add to run");
          }

        } else // still in END processing

          if ( fieldIsTopLevel() ) {
         
            if (!currentField.haveSeenSeparate()) {
              // Word 2010 can produce a docx where:
              //  <w:r>
              //    <w:fldChar w:fldCharType="separate"/>
              //  </w:r>
              // is missing (valid per spec).
             
              // For top level fields only, we add this
              log.debug(".. ADDING SEP ..  ");
 
  //            R separateR = Context.getWmlObjectFactory().createR();             
              FldChar fldChar = Context.getWmlObjectFactory().createFldChar();
              fldChar.setFldCharType(STFldCharType.SEPARATE);
              newR.getContent().add(fldChar);
                         
              if (!newAttachPoint.getContent().contains(newR)) {
                newAttachPoint.getContent().add(newR);
                log.debug("-- attaching -->" + XmlUtils.marshaltoString(newR, true, true));
              }
             
            }         
           
            // set up results slot - only for top-level fields
            newR = currentField.getResultsSlot(); // MERGEFORMAT processing below may have set this already
            if (newR==null) {
              newR = Context.getWmlObjectFactory().createR();
              currentField.setResultsSlot(newR);
            }
            if (!newAttachPoint.getContent().contains(newR)) { // test, since this is also done immediately before each loop ends
              newAttachPoint.getContent().add(newR);
              log.debug("-- attaching -->" + XmlUtils.marshaltoString(newR, true, true));
            }
           
           
            // create a run specifically for end char
            newR = Context.getWmlObjectFactory().createR();
            newAttachPoint.getContent().add(newR);
            newR.getContent().add(o2);
            currentField.setEndRun(newR);
           
            //for whatever follows the field
            newR = Context.getWmlObjectFactory().createR();
           
          } else {
            newR.getContent().add(o2)
           

          }
         
        }
       
        stack.pop();
        currentField = stack.peek();
       
      } else if (currentField==null) {
          // run content before or after the field
          // - preserve this content
         
          newR.getContent().add(o2);

          if (!newAttachPoint.getContent().contains(newR)) {
            newAttachPoint.getContent().add(newR);
            log.debug("-- attaching -->" + XmlUtils.marshaltoString(newR, true, true));
          }
       
          newR = Context.getWmlObjectFactory().createR();           
       
      } else if ( !currentField.haveSeenSeparate() ) {
       
        // Handles problems with empty w:instrText elements within complex field "begin" section
        Object o = XmlUtils.unwrap(o2);
        if (o instanceof Text && ((Text) o).getValue().trim().isEmpty()) {
          log.debug("Empty w:instrText found. Ignore it!");
          continue;
        }
       
       
//        log.debug("Processing " +((JAXBElement<Text>)o2).getValue().getValue() );
       
        currentField.getInstructions().add(o2);
        if (inParentResult()) {

          if (preserveParentResult()) {
            newR.getContent().add(o2);
          } else {
            log.debug(".. but in result, so don't add to run");
          }

        } else {       
          newR.getContent().add(o2);
        }

      } else if (preserveResult(currentField)) {
        // ie locked, or not MERGEFIELD, or DOCPROPERTY
        log.debug("preserveResult-> adding");
        newR.getContent().add(o2);   
       
        if (currentField.getResultsSlot()==null) {
          currentField.setResultsSlot(newR)// no harm in doing this - same as in SEPARATE processing?
        } else if (currentField.getResultsSlot()!=newR) {
          log.warn("Multiple runs in results slot?");
        }
       
      } else {
        // result content .. can ignore unless it has \* MERGEFORMAT
       
        // if \* MERGEFORMAT, attach the rPr of first run in the result
        if (o2 instanceof R
            && currentField.isMergeFormat()
            && currentField.getResultsSlot()==null) {

          R resultR = Context.getWmlObjectFactory().createR();
          currentField.setResultsSlot(resultR);
          resultR.setRPr(((R)o2).getRPr()); // could be null, but that's ok
          log.debug("MERGEFORMAT Set rPr");
        }
       
       
       
        // TODO: a TOC field usually has a PAGEREF wrapped in a hyperlink in its
        // result part.  We should either keep the entire result, or empty it.
        // only do this if the field has no nested field; we need a way to look ahead
        // to see whether a nested field is coming up)
       
        // we only want a single run between SEPARATOR and END,
        // and we added that in the SEPARATE stuff above
        log.debug("IGNORING " + XmlUtils.marshaltoString(o2, true, true));
       
      }

      // Doesn't solve the problem of Word failing to display some spaces.
//      if ( o2 instanceof Text
//          || ((o2 instanceof JAXBElement
//              && ((JAXBElement)o2).getName().equals(_RT_QNAME)) )  ) {
//        Text t = (Text)XmlUtils.unwrap(o2);
//        t.setSpace("preserve");
//      }
     
      if (newR.getContent().size() > 0 && !newAttachPoint.getContent().contains(newR)) {
        newAttachPoint.getContent().add(newR);
      }
     
    } // end for (Object o2 : existingRun.getContent() )
   
  }
   
      private final static QName _RT_QNAME = new QName("http://schemas.openxmlformats.org/wordprocessingml/2006/main", "t");
   
 
//  public static boolean containsCharType(Object o, STFldCharType charType) {
//   
//    if (o instanceof R) {
//      for (Object o2 : ((R)o).getContent() ) {
//       
//        if (isCharType(o2, charType)) {
//            return true;
//        }       
//      }
//    }
//    return false;
//  } 

  public static boolean isCharType(Object o2, STFldCharType charType) {
   
    o2 = XmlUtils.unwrap(o2);
   
    if (o2 instanceof org.docx4j.wml.FldChar) {
      FldChar fldChar = (FldChar)o2;
      if (fldChar.getFldCharType().equals(charType) ) {
               
        return true;
      } else {
        log.debug(fldChar.getFldCharType().toString());       
      }
    }
    return false;
 
 
}
TOP

Related Classes of org.docx4j.model.fields.FieldsPreprocessor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.