Source Code of com.adobe.epubcheck.ctc.EpubHTML5StructureCheck

package com.adobe.epubcheck.ctc;


import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.Hashtable;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;


import com.adobe.epubcheck.api.Report;
import com.adobe.epubcheck.ctc.epubpackage.EpubPackage;
import com.adobe.epubcheck.ctc.epubpackage.ManifestItem;
import com.adobe.epubcheck.ctc.epubpackage.SpineItem;
import com.adobe.epubcheck.ctc.xml.HTMLTagsAnalyseHandler;
import com.adobe.epubcheck.ctc.xml.XMLContentDocParser;
import com.adobe.epubcheck.messages.MessageId;
import com.adobe.epubcheck.messages.MessageLocation;
import com.adobe.epubcheck.ocf.EncryptionFilter;
import com.adobe.epubcheck.opf.DocumentValidator;
import com.adobe.epubcheck.util.EPUBVersion;
import com.adobe.epubcheck.util.FeatureEnum;
import com.adobe.epubcheck.util.SearchDictionary;
import com.adobe.epubcheck.util.SearchDictionary.DictionaryType;


public class EpubHTML5StructureCheck implements DocumentValidator
{
  static final int hasHtml = 1;
  static final int hasPublic = 2;
  static final int hasW3C = 4;
  static final int hasXhtml = 8;
  static final int hasHTML5 = hasHtml;
  static final int hasHTML4 = hasPublic | hasW3C | hasXhtml;


  final ZipFile zip;
  final Report report;
  final EpubPackage epubPackage;
  final Hashtable<String, EncryptionFilter> enc;


  public EpubHTML5StructureCheck(EpubPackage epack, Report report)
  {
    this.zip = epack.getZip();
    this.report = report;
    this.epubPackage = epack;
    this.enc = new Hashtable<String, EncryptionFilter>();
  }


  @Override
  public boolean validate()
  {
    boolean result = false;
    SearchDictionary vtsd = new SearchDictionary(DictionaryType.VALID_TEXT_MEDIA_TYPES);
    int landmarkNavCount = 0;
    boolean isGlobalFixed = EpubPackage.isGlobalFixed(this.epubPackage);


    Hashtable<String, SpineItem> spineItems = new Hashtable<String, SpineItem>();
    for (int i = 0; i < epubPackage.getSpine().itemsLength(); ++i)
    {
      SpineItem si = epubPackage.getSpine().getItem(i);
      spineItems.put(si.getIdref(), si);
    }


    for (int i = 0; i < epubPackage.getManifest().itemsLength(); i++)
    {
      ManifestItem mi = epubPackage.getManifest().getItem(i);
      if (vtsd.isValidMediaType(mi.getMediaType()))
      {
        XMLContentDocParser parser = new XMLContentDocParser(epubPackage.getZip(), report);
        HTMLTagsAnalyseHandler sh = new HTMLTagsAnalyseHandler();
        sh.setReport(report);
        SpineItem si = spineItems.get(mi.getId());
        boolean itemIsFixedFormat = isGlobalFixed;
        if (si != null)
        {


          String properties = si.getProperties();
          if (properties != null)
          {
            if (properties != null && !properties.equals(""))
            {
              properties = properties.replaceAll("[\\s]+", " ");
              String propertyArray[] = properties.split(" ");
              for (String prop : propertyArray)
              {
                if (prop.equals("rendition:layout-pre-paginated"))
                {
                  itemIsFixedFormat = true;
                }
                else if (prop.equals("rendition:layout-reflowable"))
                {
                  itemIsFixedFormat = false;
                }
              }
            }
          }
          sh.setIsFixed(itemIsFixedFormat);
        }


        String fileToParse = epubPackage.getManifestItemFileName(mi);


        ZipEntry entry = zip.getEntry(fileToParse);
        if (entry == null)
        {
          String fileName = new File(zip.getName()).getName();
          report.message(MessageId.RSC_001, new MessageLocation(fileName, -1, -1), fileToParse);
          continue;
        }
        sh.setVersion(epubPackage.getVersion());
        sh.setFileName(fileToParse);
        //parser.parseDoc(fileToParse, sh);
        /***VALIDATE FILE EXTENSION***/


        String fileExtension = mi.getHref().substring(mi.getHref().lastIndexOf('.') + 1, mi.getHref().length());
        if (epubPackage.getVersion() == EPUBVersion.VERSION_2
            && !(fileExtension.compareToIgnoreCase("html") == 0
                || fileExtension.compareToIgnoreCase("htm") == 0 
                || fileExtension.compareToIgnoreCase("xhtml") == 0))
        {
          // Note: extension is already checked in OPFChecker30 for EPUB 3 
          report.message(MessageId.HTM_014, new MessageLocation(mi.getHref(), -1, -1));
        }


        /***VALIDATE DOCTYPE***/
        int docTypeMatches = findMatchingDocumentTypePatterns(fileToParse);


        if ((0 != (docTypeMatches & hasHTML4)) && (epubPackage.getVersion() == EPUBVersion.VERSION_3))
        {
          report.message(MessageId.HTM_015, new MessageLocation(mi.getHref(), -1, -1));
        }
        else if ((0 != (docTypeMatches & hasHTML5)) && ((hasXhtml != (docTypeMatches & hasXhtml)))  &&  (epubPackage.getVersion() == EPUBVersion.VERSION_2))
        {
          report.message(MessageId.HTM_016, new MessageLocation(mi.getHref(), -1, -1));
        }
        parser.parseDoc(fileToParse, sh);


        if (sh.getHtml5SpecTagsCounter() > 0)
        {
          report.info(fileToParse, FeatureEnum.HAS_HTML5, "true");
          if (epubPackage.isSpineItem(mi.getId()))
          {
            // Report that there is HTML5 for the entire publication only if it is in a spine item.
            // This is used for the 'is backward compatible' check.
            // This is so the HTML5 (nav tag) in a toc document will be ignored for backwards compatibility testing.
            report.info(null, FeatureEnum.HAS_HTML5, "true");
          }
        }
        landmarkNavCount += sh.getLandmarkNavCount();
      }
    }
    if (landmarkNavCount != 1 && epubPackage.getVersion() == EPUBVersion.VERSION_3)
    {
      File zipFile = new File(zip.getName());
      report.message(MessageId.ACC_008, new MessageLocation(zipFile.getName(), -1, -1));
    }


    return result;
  }


  InputStream getInputStream(String name) throws
      IOException
  {
    ZipEntry entry = zip.getEntry(name);
    if (entry == null)
    {
      return null;
    }
    InputStream in = zip.getInputStream(entry);
    EncryptionFilter filter = enc.get(name);
    if (filter == null)
    {
      return in;
    }
    if (filter.canDecrypt())
    {
      return filter.decrypt(in);
    }
    return null;
  }


  static final Pattern patternDocTypeElement = Pattern.compile("<*!*[Dd][Oo][Cc][Tt][Yy][Pp][Ee]");
  static final Pattern patternHtmlElement = Pattern.compile("([^Xx][Hh][Tt][Mm][Ll])");
  static final Pattern patternPublicElement = Pattern.compile("[Pp][Uu][Bb][Ll][Ii][Cc]");
  static final Pattern patternW3CElement = Pattern.compile("[Ww][3][Cc]//[Dd][Tt][Dd]");
  static final Pattern patternXhtmlElement = Pattern.compile("[Xx][Hh][Tt][Mm][Ll]");


  int findMatchingDocumentTypePatterns(String entry)
  {
    InputStream is = null;
    int matchingPatterns = 0;
    try
    {
      is = getInputStream(entry);
      if (is == null)
      {
        throw new IOException("Input Stream not found: '" + entry + "'");
      }


      Scanner in = new Scanner(is);
      StringBuilder sb = new StringBuilder();
      int numBracketsToClose = 0;
      String line = null;


      // skip over every line until we find the !DOCTYPE
      while (in.hasNextLine())
      {
        line = in.nextLine();
        Matcher matcher = patternDocTypeElement.matcher(line);
        if (matcher.find())
        {
          numBracketsToClose = 1;
          int i = matcher.start();
          if (i >= 0)
          {
            // prime it here so we can enter the loop below
            sb.append("<");
            line = line.substring(i + 1);
            break;
          }
        }
      }


      // now start appending characters until we close all nested '<' with matching '>'
      while ((numBracketsToClose > 0) && (line != null))
      {
        int i = 0;
        while ((numBracketsToClose > 0) && (i < line.length()))
        {
          Character ch = line.charAt(i);
          if (ch == '<')
          {
            ++numBracketsToClose;
          }
          else if (ch == '>')
          {
            --numBracketsToClose;
          }
          sb.append(ch);
          ++i;
        }
        if (in.hasNextLine())
        {
          sb.append(" ");
          line = in.nextLine();
        }
        else
        {
          line = null;
        }
      }


      if (numBracketsToClose > 0)
      {
        // There's an error.  We ran out of characters before finding the matching '>'
        return -1;
      }


      line = sb.toString();


      matchingPatterns |= checkPattern(line, patternHtmlElement, hasHtml);
      matchingPatterns |= checkPattern(line, patternPublicElement, hasPublic);
      matchingPatterns |= checkPattern(line, patternW3CElement, hasW3C);
      matchingPatterns |= checkPattern(line, patternXhtmlElement, hasXhtml);
    }
    catch (Exception e)
    {
      e.printStackTrace();
      report.message(MessageId.PKG_008, new MessageLocation(entry, -1, -1), e.getMessage());
    }
    finally
    {
      if (is != null)
      {
        try
        {
          is.close();
        }
        catch (Exception ignore)
        {
        }
      }
    }
    return matchingPatterns;
  }


  int checkPattern(String line, Pattern patternElement, int mask)
  {
    Matcher matcherElement = patternElement.matcher(line);
    return (matcherElement.find()) ? mask : 0;
  }
}
Source Code of com.adobe.epubcheck.ctc.EpubHTML5StructureCheck

Related Classes of com.adobe.epubcheck.ctc.EpubHTML5StructureCheck