Office File Validation Class

I work with a couple of customers that need to validate files before they are opened. Their solutions require them to be able to take specific actions if the files are password protected or corrupt before they are opened, such as not opening them. Waiting for Word to error out, prompt for a password or fail to open the file can be too late in the process…

The following code has not really been tested extensively (especially with regard to performance). It has been tested on a small subset of test files and worked. However, it may need some additional field testing.

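To use it, create a new instance of the OfficeFileValidator class, passing the full path of the file, and then call Validate(). The result tells you whether the file is valid, encrypted, corrupt, or not an Office file at all. Comments on how this is done are inline with the code…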

// Let the user pick a file, validate it, and report the outcome.
// OpenFileDialog is IDisposable, so it is wrapped in a using block.
using (OpenFileDialog LobjOfd = new OpenFileDialog())
{
    if (LobjOfd.ShowDialog() == System.Windows.Forms.DialogResult.OK)
    {
        OfficeFileValidator LobjValidator = new OfficeFileValidator(LobjOfd.FileName);
        OfficeFileValidator.ValidationResult LobjResult = LobjValidator.Validate();
        // failure details are only meaningful for VR_CORRUPT and VR_UNKNOWN
        OfficeFileValidator.ValidatorException LobjInfo = LobjValidator.GetExceptionInfo;
        switch (LobjResult)
        {
            case OfficeFileValidator.ValidationResult.VR_CORRUPT:
                MessageBox.Show("This file appears to be corrupt: \n\n" +
                                "The part that failed is: " + LobjInfo.FailedPartUri + "\n\n" +
                                "The reason: " + LobjInfo.FailedReason);
                break;
            case OfficeFileValidator.ValidationResult.VR_VALID:
                MessageBox.Show("This is a valid Office file.");
                break;
            case OfficeFileValidator.ValidationResult.VR_ENCRYPTED:
                MessageBox.Show("This is an encrypted Office file.");
                break;
            case OfficeFileValidator.ValidationResult.VR_UNKNOWN:
                MessageBox.Show("This does not appear to be a valid Office file: \n\n\t" + LobjInfo.FailedReason);
                break;
            case OfficeFileValidator.ValidationResult.VR_BADEXT:
                MessageBox.Show("This is not an Office file.");
                break;
        }
    }
}

Here is the class:

/// <summary>
/// OFFICE FILE VALIDATOR CLASS
/// 
/// This class is used to determine if any given file:
/// 1) Is a valid Office file in which every part can be read
/// 2) Is an encrypted/password protected Office file
/// 3) Is a corrupted file
/// 4) Has a non-Office extension or unrecognized content
/// 
/// In the case of VR_CORRUPT or VR_UNKNOWN additional information
/// can be found in the GetExceptionInfo property
/// </summary>
public class OfficeFileValidator
{
    // number of bytes at the end of the file that are searched for the
    // encryption URI; also used as the chunk size when draining binary parts
    private const int CiBufferSize = 4096;
    public enum ValidationResult { VR_BADEXT, VR_UNKNOWN, VR_VALID, VR_ENCRYPTED, VR_CORRUPT };
    private const string CstrEncryptionSchemaUri = "http://schemas.microsoft.com/office/2006/keyEncryptor/password";
    private readonly string MstrFilename;
    private ValidatorException MobjException = new ValidatorException("N/A");
    private static readonly string[] MobjValidExtensions = new string[] { "pptx", "pptm", "potx", "potm", 
                                                                          "docx", "docm", "dotx", "dotm", 
                                                                          "xlsx", "xlsm", "xltx", "xltm", "xlsb" };

    /// <summary>
    /// CTOR - takes a full path and filename
    /// </summary>
    /// <param name="PstrFilename">Path of the file to validate. It is not
    /// touched until Validate() is called.</param>
    public OfficeFileValidator(string PstrFilename)
    {
        MstrFilename = PstrFilename;
    }

    /// <summary>
    /// READ-ONLY PROPERTY
    /// Return the failure information recorded during validation
    /// if the result returned is VR_CORRUPT, VR_UNKNOWN or VR_BADEXT.
    /// Holds the placeholder reason "N/A" otherwise.
    /// </summary>
    public ValidatorException GetExceptionInfo
    {
        get
        {
            return MobjException;
        }
    }

    /// <summary>
    /// Validate the office file and returns the result of:
    /// VR_BADEXT - if the extension is not a known OpenXML extension
    /// VR_VALID - if it is a standard, validated Office file
    /// VR_ENCRYPTED - if it is an encrypted file
    /// VR_CORRUPT - if there is something wrong with the structure of the file
    /// VR_UNKNOWN - if it is not a valid Office file, or validation itself failed
    /// </summary>
    /// <returns>The classification of the file; this method does not throw.</returns>
    public ValidationResult Validate()
    {
        // forget anything recorded by a previous call
        MobjException = new ValidatorException("N/A");
        try
        {
            // first start off validating the file extension
            // (invariant casing so the match does not depend on the current culture)
            string LstrExt = new FileInfo(MstrFilename).Extension.ToLowerInvariant().Replace(".", "");
            if (!MobjValidExtensions.Contains(LstrExt))
            {
                MobjException = new ValidatorException("The file extension is not valid.");
                return ValidationResult.VR_BADEXT;
            }

            // a file starting with the bytes 'P' 'K' is treated as a Zip package
            if (hasZipSignature(MstrFilename))
            {
                // now make sure all of its parts are there
                if (isStructureValid(MstrFilename))
                    return ValidationResult.VR_VALID;
                return ValidationResult.VR_CORRUPT;
            }

            // not a Zip package - look at the end of the file to see if we
            // can find the Encrypted URI
            if (tailContainsEncryptionUri(MstrFilename))
                return ValidationResult.VR_ENCRYPTED;

            // The problem here is that the file might be completely corrupt
            // a binary file renamed with an OpenXml extension or it is
            // another file type named as an OpenXml extension - invalid
            MobjException = new ValidatorException("File content is not recognized.");
            return ValidationResult.VR_UNKNOWN;
        }
        catch (Exception ex)
        {
            // somewhere above an exception occurred (missing file, access
            // denied, ...) - we are not sure what happened, so just return
            // the exception text
            MobjException = new ValidatorException(ex.Message);
            return ValidationResult.VR_UNKNOWN;
        }
    }

    /// <summary>
    /// HELPER METHOD
    /// Returns true if the first two bytes of the file are 0x50 0x4B ("PK").
    /// The raw bytes are compared so that no text decoding is involved.
    /// A file shorter than two bytes returns false.
    /// </summary>
    /// <param name="PstrFilename">Path of the file to inspect</param>
    /// <returns>true if the file starts with "PK"</returns>
    private static bool hasZipSignature(string PstrFilename)
    {
        using (FileStream LobjFs = new FileStream(PstrFilename, FileMode.Open, FileAccess.Read, FileShare.Read))
        {
            // ReadByte returns -1 at end of file, which never matches
            return LobjFs.ReadByte() == 0x50 && LobjFs.ReadByte() == 0x4B;
        }
    }

    /// <summary>
    /// HELPER METHOD
    /// Searches the last CiBufferSize bytes of the file (or the whole file
    /// if it is smaller than that) for the password key encryptor URI.
    /// </summary>
    /// <param name="PstrFilename">Path of the file to inspect</param>
    /// <returns>true if the URI was found</returns>
    private static bool tailContainsEncryptionUri(string PstrFilename)
    {
        using (FileStream LobjFs = new FileStream(PstrFilename, FileMode.Open, FileAccess.Read, FileShare.Read))
        {
            // never seek before the start of a file smaller than the buffer
            int LiTailLength = (int)Math.Min((long)CiBufferSize, LobjFs.Length);
            LobjFs.Seek(-LiTailLength, SeekOrigin.End);

            // Read may return fewer bytes than requested, so loop until done
            byte[] LobjBuffer = new byte[LiTailLength];
            int LiTotal = 0;
            while (LiTotal < LiTailLength)
            {
                int LiRead = LobjFs.Read(LobjBuffer, LiTotal, LiTailLength - LiTotal);
                if (LiRead == 0)
                    break;
                LiTotal += LiRead;
            }

            // the URI is plain ASCII, so an ASCII decode is enough to find it
            // among the surrounding binary data
            string LstrTail = System.Text.Encoding.ASCII.GetString(LobjBuffer, 0, LiTotal);
            return LstrTail.Contains(CstrEncryptionSchemaUri);
        }
    }

    /// <summary>
    /// HELPER METHOD
    /// This method opens the file as a Zip Package and then
    /// validates that:
    /// 1) The package can be opened and its parts enumerated
    /// 2) All parts can load a stream and be read to the end
    /// 3) That all parts whose name ends in .xml are well-formed XML
    /// 
    /// Returns true if it is valid. On failure the reason and the URI of
    /// the part being processed are stored for GetExceptionInfo.
    /// </summary>
    /// <param name="PobjFilename">Path of the package to open</param>
    /// <returns>true if every part could be read</returns>
    private bool isStructureValid(string PobjFilename)
    {
        string lastPartUri = "";
        try
        {
            // open the package in a ZIP PACKAGER; the using blocks make sure
            // the package and part streams are closed on every path
            using (Package LobjZip = Package.Open(PobjFilename, FileMode.Open, FileAccess.Read))
            {
                // loop through all the parts
                foreach (PackagePart LobjPart in LobjZip.GetParts())
                {
                    // log the uri of this part
                    lastPartUri = LobjPart.Uri.OriginalString.ToLowerInvariant();
                    // grab the stream for the part
                    using (Stream LobjPartStream = LobjPart.GetStream(FileMode.Open, FileAccess.Read))
                    {
                        // is the part an XML component?
                        if (lastPartUri.EndsWith(".xml", StringComparison.Ordinal))
                        {
                            // load it into an XML Doc and verify that it loads
                            XmlDocument LobjPartDoc = new XmlDocument();
                            // the file is untrusted: do not resolve external
                            // DTDs or entities while parsing
                            LobjPartDoc.XmlResolver = null;
                            LobjPartDoc.Load(LobjPartStream);
                            if (LobjPartDoc.DocumentElement == null ||
                                LobjPartDoc.DocumentElement.OuterXml.Length == 0)
                            {
                                MobjException = new ValidatorException("Part not valid [empty].", lastPartUri);
                                return false;
                            }
                        }
                        else
                        {
                            // we have a binary part - just try to reach the
                            // end, without decoding it as text
                            byte[] LobjChunk = new byte[CiBufferSize];
                            while (LobjPartStream.Read(LobjChunk, 0, LobjChunk.Length) > 0)
                            {
                            }
                        }
                    }
                }
            }
            // here and ok
            return true;
        }
        catch (Exception ex)
        {
            // something bad happened above (including a failure to open the
            // package at all) - just record the exception text
            MobjException = new ValidatorException(ex.Message, lastPartUri);
            return false;
        }
    }

    /// <summary>
    /// INTERNAL EXCEPTION CLASS
    /// This is the class that is used to describe what the failure was.
    /// It is a plain data holder; it does not derive from System.Exception
    /// and is never thrown.
    /// </summary>
    public class ValidatorException
    {
        /// <summary>Lower-cased URI of the part being processed when the
        /// failure occurred, or an empty string if not part-specific.</summary>
        public string FailedPartUri { get; set; }
        /// <summary>Text describing why validation failed.</summary>
        public string FailedReason { get; set; }
        public ValidatorException(string PstrReason, string PstrLastUri)
        {
            FailedPartUri = PstrLastUri;
            FailedReason = PstrReason;
        }
        public ValidatorException(string PstrReason)
        {
            FailedPartUri = "";
            FailedReason = PstrReason;
        }

    }
}

Leave a Reply