I work with a couple of customers that need to validate files before they are opened. Their solutions require them to be able to take specific actions if the files are password protected or corrupt before they are opened, such as not opening them. Waiting for Word to error out, prompt for a password or fail to open the file can be too late in the process…
The following code has not really been tested extensively (especially with regard to performance). It has been tested on a small subset of test files and worked. However, it may need some additional field testing.
To use it, create a new instance of the OfficeFileValidator class, passing it the full path of the file, and then call its Validate() method. Comments on how the validation is done are inline with the code…
// Example usage: let the user pick a file, validate it, and report the outcome.
// The dialog is wrapped in a using block so its native resources are released.
using (OpenFileDialog LobjOfd = new OpenFileDialog())
{
    if (LobjOfd.ShowDialog() == System.Windows.Forms.DialogResult.OK)
    {
        OfficeFileValidator LobjValidator = new OfficeFileValidator(LobjOfd.FileName);
        switch (LobjValidator.Validate())
        {
            case OfficeFileValidator.ValidationResult.VR_CORRUPT:
                // GetExceptionInfo carries the failing part and the reason
                MessageBox.Show("This file appears to be corrupt: \n\n" +
                                "The part that failed is: " + LobjValidator.GetExceptionInfo.FailedPartUri + "\n\n" +
                                "The reason: " + LobjValidator.GetExceptionInfo.FailedReason);
                break;
            case OfficeFileValidator.ValidationResult.VR_VALID:
                MessageBox.Show("This is a valid Office file.");
                break;
            case OfficeFileValidator.ValidationResult.VR_ENCRYPTED:
                MessageBox.Show("This is an encrypted Office file.");
                break;
            case OfficeFileValidator.ValidationResult.VR_UNKNOWN:
                MessageBox.Show("This does not appear to be a valid Office file: \n\n\t" +
                                LobjValidator.GetExceptionInfo.FailedReason);
                break;
            case OfficeFileValidator.ValidationResult.VR_BADEXT:
                MessageBox.Show("This is not an Office file.");
                break;
        }
    }
}
Here is the class:
/// <summary>
/// OFFICE FILE VALIDATOR CLASS
///
/// This class is used to determine if any given file:
///  1) Is a valid Office file with all parts present
///  2) Is an encrypted/password protected Office file
///  3) Is a corrupted file
///
/// In the case of corruption additional information
/// can be found in the GetExceptionInfo property
/// </summary>
public class OfficeFileValidator
{
    // size of the tail window searched for the encryption URI; also used
    // as the scratch buffer size when draining binary parts
    private const int CiBufferSize = 4096;

    public enum ValidationResult { VR_BADEXT, VR_UNKNOWN, VR_VALID, VR_ENCRYPTED, VR_CORRUPT };

    private const string CstrEncryptionSchemaUri = "http://schemas.microsoft.com/office/2006/keyEncryptor/password";
    private readonly string MstrFilename;
    private ValidatorException MobjException = new ValidatorException("N/A");
    private readonly string[] MobjValidExtensions = new string[]
    {
        "pptx", "pptm", "potx", "potm",
        "docx", "docm", "dotx", "dotm",
        "xlsx", "xlsm", "xltx", "xltm", "xlsb"
    };

    /// <summary>
    /// CTOR - takes a full path and filename
    /// </summary>
    /// <param name="PstrFilename">Full path of the file to validate.</param>
    public OfficeFileValidator(string PstrFilename)
    {
        MstrFilename = PstrFilename;
    }

    /// <summary>
    /// READ-ONLY PROPERTY
    /// Return the failure information recorded during validation
    /// if the result returned is VR_CORRUPT, VR_UNKNOWN or VR_BADEXT.
    /// Holds the reason "N/A" when no failure has been recorded.
    /// </summary>
    public ValidatorException GetExceptionInfo
    {
        get { return MobjException; }
    }

    /// <summary>
    /// Validate the office file and returns the result of:
    ///     VR_BADEXT    - if the file extension is not a known Office extension
    ///     VR_VALID     - if it is a standard, validated Office file
    ///     VR_ENCRYPTED - if it is an encrypted file
    ///     VR_CORRUPT   - if there is something wrong with the structure of the file
    ///     VR_UNKNOWN   - if it is not a valid Office file, or an unexpected
    ///                    error (e.g. the file cannot be read) occurred
    /// </summary>
    /// <returns>The validation result; details are in GetExceptionInfo.</returns>
    public ValidationResult Validate()
    {
        try
        {
            // clear anything left over from a previous call
            MobjException = new ValidatorException("N/A");

            // first start off validating the file extension
            string LstrExt = new FileInfo(MstrFilename).Extension.TrimStart('.').ToLowerInvariant();
            if (Array.IndexOf(MobjValidExtensions, LstrExt) < 0)
            {
                MobjException = new ValidatorException("The file extension is not valid.");
                return ValidationResult.VR_BADEXT;
            }

            // if the file starts with PK (for package), then
            // we know we have a Zip Package
            if (startsWithPackageMarker())
            {
                // now make sure all of its parts are there
                return isStructureValid(MstrFilename)
                    ? ValidationResult.VR_VALID
                    : ValidationResult.VR_CORRUPT;
            }

            // not a Zip Package - look for the Encrypted URI near the end
            if (tailContainsEncryptionUri())
            {
                return ValidationResult.VR_ENCRYPTED;
            }

            // The problem here is that the file might be completely corrupt
            // a binary file renamed with an OpenXml extension or it is
            // another file type named as an OpenXml extension - invalid
            MobjException = new ValidatorException("File content is not recognized.");
            return ValidationResult.VR_UNKNOWN;
        }
        catch (Exception ex)
        {
            // somewhere above an exception occurred - we are not sure
            // what happened, so just return the exception text
            MobjException = new ValidatorException(ex.Message);
            return ValidationResult.VR_UNKNOWN;
        }
    }

    /// <summary>
    /// HELPER METHOD
    /// Reads the first two characters of the file and reports whether
    /// they are "PK" (compared case-insensitively).
    /// </summary>
    private bool startsWithPackageMarker()
    {
        using (StreamReader LobjSr = new StreamReader(MstrFilename))
        {
            char[] LobjBuffer = new char[2];
            int LiRead = LobjSr.ReadBlock(LobjBuffer, 0, LobjBuffer.Length);
            // only compare what was actually read (the file may be shorter)
            return string.Equals(new string(LobjBuffer, 0, LiRead), "PK", StringComparison.OrdinalIgnoreCase);
        }
    }

    /// <summary>
    /// HELPER METHOD
    /// Reports whether the last CiBufferSize bytes of the file (or the
    /// whole file when it is smaller than that) contain the password
    /// key-encryptor schema URI. This is a heuristic text search.
    /// </summary>
    private bool tailContainsEncryptionUri()
    {
        using (StreamReader LobjSr = new StreamReader(MstrFilename))
        {
            Stream LobjBase = LobjSr.BaseStream;
            // clamp to 0 so files shorter than the window do not cause a
            // seek before the beginning of the stream
            long LlStart = Math.Max(0L, LobjBase.Length - CiBufferSize);
            LobjBase.Seek(LlStart, SeekOrigin.Begin);

            char[] LobjBuffer = new char[CiBufferSize];
            int LiRead = LobjSr.ReadBlock(LobjBuffer, 0, CiBufferSize);
            return new string(LobjBuffer, 0, LiRead).Contains(CstrEncryptionSchemaUri);
        }
    }

    /// <summary>
    /// HELPER METHOD
    /// This method opens the file as a Zip Package and then
    /// validates that:
    ///  1) All parts are present
    ///  2) All parts can load a stream
    ///  3) That all XML parts are valid XML
    ///
    /// Returns true if it is valid. On failure the reason and the URI of
    /// the part being processed are stored for GetExceptionInfo.
    /// </summary>
    /// <param name="PobjFilename">Full path of the package to inspect.</param>
    /// <returns>true when every part could be read; otherwise false.</returns>
    private bool isStructureValid(string PobjFilename)
    {
        string lastPartUri = "";
        try
        {
            // open the package in a ZIP PACKAGER; the using block closes it
            // on every path, including when Open itself or a part fails
            using (Package LobjZip = Package.Open(PobjFilename, FileMode.Open, FileAccess.Read))
            {
                // loop through all the parts
                foreach (PackagePart LobjPart in LobjZip.GetParts())
                {
                    // log the uri of this part
                    lastPartUri = LobjPart.Uri.OriginalString.ToLowerInvariant();

                    // grab the stream for the part
                    using (Stream LobjPartStream = LobjPart.GetStream())
                    {
                        // is the part an XML component?
                        if (lastPartUri.EndsWith(".xml", StringComparison.Ordinal))
                        {
                            // load it into an XML Doc and verify that it loads
                            XmlDocument LobjPartDoc = new XmlDocument();
                            // the file is untrusted: never fetch external entities/DTDs
                            LobjPartDoc.XmlResolver = null;
                            LobjPartDoc.Load(LobjPartStream);
                            if (LobjPartDoc.DocumentElement == null ||
                                LobjPartDoc.DocumentElement.OuterXml.Length == 0)
                            {
                                MobjException = new ValidatorException("Part not valid [empty].", lastPartUri);
                                return false;
                            }
                        }
                        else
                        {
                            // we have a binary part - just try to read to the
                            // end, without decoding it into a string
                            byte[] LobjScratch = new byte[CiBufferSize];
                            while (LobjPartStream.Read(LobjScratch, 0, LobjScratch.Length) > 0)
                            {
                            }
                        }
                    }
                }
            }
            // here and ok
            return true;
        }
        catch (Exception ex)
        {
            // something bad happened above - unknown - just return the exception
            MobjException = new ValidatorException(ex.Message, lastPartUri);
            return false;
        }
    }

    /// <summary>
    /// INTERNAL EXCEPTION CLASS
    /// This is the class that is used to describe what the failure was
    /// during validation. Note that despite its name it is a plain data
    /// holder and does not derive from System.Exception.
    /// </summary>
    public class ValidatorException
    {
        /// <summary>URI of the package part being processed when the failure occurred ("" if none).</summary>
        public string FailedPartUri { get; set; }

        /// <summary>Human-readable reason for the failure.</summary>
        public string FailedReason { get; set; }

        public ValidatorException(string PstrReason, string PstrLastUri)
        {
            FailedPartUri = PstrLastUri;
            FailedReason = PstrReason;
        }

        public ValidatorException(string PstrReason)
        {
            FailedPartUri = "";
            FailedReason = PstrReason;
        }
    }
}