Sunday, January 15, 2006

Clean Word Html (command line tool)

This command line tool is based on the code from Cleaning Word's Nasty HTML, and has been backported to .NET 1.1. To compile, download Snippet Compiler (the version for .NET 1.1). Then do File > New > Default.cs and clear the contents. Paste in the following code, then click Build > Build Current To File and call it CleanWordHtml. Open the Command Prompt at the location it was saved to and type CleanWordHtml for help.

Edit (18-Jan-06): remove u tags. Not all empty tags were removed. Does not remove empty table cells (as they may be used for column/row layout). Quoted class attributes are removed.

Edit (2-Feb-06): As a side effect of removing u tags, ul tags were also removed, so u tags are no longer removed. When reading in text from a file, line breaks were not read in, but now they are. You can now drag files onto the application (rather than resorting to the command line).

CleanWordHtml.cs

using System;
using System.Reflection;
using System.Collections.Specialized;
using System.Text;
using System.Text.RegularExpressions;
using System.IO;

[assembly: AssemblyTitle("CleanWordHtml")]
[assembly: AssemblyDescription("Cleans up HTML generated by Microsoft Word")]
[assembly: AssemblyVersion("1.0.1.*")]

/// <summary>
/// Command line tool that strips Word-generated clutter from an HTML file
/// and saves the result next to the input as "name.modified.ext".
/// </summary>
public class CleanUp
{
    // Option flags: written by Main while parsing arguments, read by CleanWordHtml.
    static bool Mso = false;         // -mso: only strip class attributes whose value starts with "Mso"
    static bool IgnoreSpans = false; // -ignorespans: leave <span> tags in place
    static bool IgnoreDivs = false;  // -ignoredivs: leave <div> tags in place

    /// <summary>
    /// Parses the command line, cleans the named file and writes the result
    /// to a sibling file with ".modified" inserted before the extension.
    /// Prints the usage text when no usable path is supplied.
    /// </summary>
    /// <param name="args">
    /// Either a single path, or any mix of "-path &lt;file&gt;", "-mso",
    /// "-ignorespans", "-ignoredivs" and a bare path.
    /// </param>
    static void Main(string[] args)
    {
        string help = "Cleans up HTML generated by Microsoft Word" + Environment.NewLine + Environment.NewLine
            + "Usage:" + Environment.NewLine
            + "------" + Environment.NewLine
            + "CleanWordHtml \"path to file\"" + Environment.NewLine
            + "CleanWordHtml -path \"path to file\"" + Environment.NewLine + Environment.NewLine
            + "  Other options:" + Environment.NewLine
            + "   -mso (remove only classes generated by word)" + Environment.NewLine
            + "   -ignorespans (don't remove span tags)" + Environment.NewLine
            + "   -ignoredivs (don't remove div tags)";

        if (args.Length == 0 || IsNullOrEmpty(args[0]))
        {
            Console.WriteLine(help);
            return;
        }

        string filepath = string.Empty;
        if (args.Length == 1)
        {
            // A single argument is always treated as the path.
            filepath = args[0];
        }
        else
        {
            for (int i = 0; i < args.Length; i++)
            {
                string arg = args[i];
                // Lower-case with the invariant culture so that option matching
                // does not depend on the user's locale (e.g. Turkish dotless i).
                string option = arg.ToLower(System.Globalization.CultureInfo.InvariantCulture);
                switch (option)
                {
                    case "-path":
                        // Only read the value if there is one; a trailing "-path"
                        // leaves filepath empty and falls through to the help text.
                        if (i + 1 < args.Length)
                        {
                            filepath = args[i + 1];
                        }
                        break;
                    case "-mso":
                        Mso = true;
                        break;
                    case "-ignorespans":
                        IgnoreSpans = true;
                        break;
                    case "-ignoredivs":
                        IgnoreDivs = true;
                        break;
                    default:
                        // The first argument that is not an option is accepted as the
                        // path, so "CleanWordHtml file.htm -mso" works. An explicit
                        // -path still wins because it overwrites this value.
                        if (IsNullOrEmpty(filepath) && arg.Length > 0 && arg[0] != '-')
                        {
                            filepath = arg;
                        }
                        break;
                }
            }
        }

        if (IsNullOrEmpty(filepath))
        {
            Console.WriteLine(help);
            return;
        }
        // A bare file name is resolved against the current directory.
        if (Path.GetFileName(filepath) == filepath)
        {
            filepath = Path.Combine(Environment.CurrentDirectory, filepath);
        }
        if (!File.Exists(filepath))
        {
            Console.WriteLine("File '" + filepath + "' doesn't exist.");
            return;
        }

        string html = ReadAllText(filepath);
        Console.WriteLine("Input html is " + html.Length + " chars");
        html = CleanWordHtml(html);
        html = FixEntities(html);

        // Never overwrite the input: write to "<name>.modified<ext>" in the same directory.
        filepath = Path.Combine(
            Path.GetDirectoryName(filepath),
            Path.GetFileNameWithoutExtension(filepath) + ".modified" + Path.GetExtension(filepath));
        WriteAllText(filepath, html);
        Console.WriteLine("Cleaned html is " + html.Length + " chars. Saved to " + filepath);
    }

    /// <summary>
    /// Applies a fixed sequence of case-insensitive regex removals to the
    /// markup, then collapses runs of blank lines and spaces.
    /// Honours the Mso, IgnoreSpans and IgnoreDivs flags.
    /// </summary>
    /// <param name="html">Markup to clean.</param>
    /// <returns>The cleaned markup.</returns>
    static string CleanWordHtml(string html)
    {
        // Patterns whose matches are deleted outright, applied in this order.
        StringCollection removals = new StringCollection();
        if (!IgnoreSpans)
        {
            removals.Add(@"<(/?span|!\[)[^>]*?>");
        }
        if (!IgnoreDivs)
        {
            removals.Add(@"<(/?div|!\[)[^>]*?>");
        }
        if (!Mso)
        {
            // Get rid of classes. The value may be double-quoted, single-quoted or
            // bare; the whole value is consumed so nothing is left behind in the tag.
            removals.Add(@"\s?class=(""[^""]*""|'[^']*'|[^\s>]+)");
        }
        else
        {
            // Get rid of office classes only
            removals.Add(@"\s?class=[""']?Mso\w+[""']?");
        }
        // Get rid of unnecessary tag spans (comments and title)
        removals.Add(@"<!--(\w|\W)+?-->");
        removals.Add(@"<title>(\w|\W)+?</title>");
        // Get rid of inline style. Values such as 'margin-left:.5in' contain
        // punctuation, so match the complete quoted (or bare) value.
        removals.Add(@"\s?style=(""[^""]*""|'[^']*'|[^\s>]+)");
        // Get rid of unnecessary tags
        removals.Add(@"<(meta|link|/?o:|/?style|/?font|/?st\d|/?head|/?html|body|/?body|!\[)[^>]*?>");
        // Get rid of empty elements (optionally holding only &nbsp;), except th/td
        // table cells. The backreference requires the closing tag to match the
        // opening tag, so a pair such as "<br></p>" is left alone.
        removals.Add(@"<(?!(?:th|td)\b)([\w:]+)(\s[^>]*)?>(&nbsp;)*</\1\s*>");
        // Remove bizarre v: attribute attached to <img> tag
        removals.Add(@"\s+v:\w+=""[^""]+""");

        foreach (string pattern in removals)
        {
            html = Regex.Replace(html, pattern, "", RegexOptions.IgnoreCase);
        }

        // Collapse (rather than delete) runs of line breaks and spaces so that
        // adjacent lines and words are not fused together.
        html = Regex.Replace(html, "(" + Regex.Escape(Environment.NewLine) + "){2,}", Environment.NewLine);
        html = Regex.Replace(html, " {2,}", " ");

        // quote unquoted attributes (disabled)
        //html = Regex.Replace(html, @"(\w+=)(\w+)(?=[ >])", @"$1""$2""", RegexOptions.IgnoreCase);
        return html;
    }

    /// <summary>
    /// Replaces curly double quotes and em dashes with their HTML entities.
    /// </summary>
    /// <param name="html">Markup to process.</param>
    /// <returns>The markup with the three characters replaced.</returns>
    static string FixEntities(string html)
    {
        // Written as Unicode escapes so the compiled strings do not depend on
        // the encoding the source file happens to be saved in.
        html = html.Replace("\u201C", "&ldquo;"); // left double quotation mark
        html = html.Replace("\u201D", "&rdquo;"); // right double quotation mark
        html = html.Replace("\u2014", "&mdash;"); // em dash
        return html;
    }

    /// <summary>
    /// Returns true when the string is null or has zero length.
    /// </summary>
    static bool IsNullOrEmpty(string value)
    {
        return value == null || value.Length == 0;
    }

    /// <summary>
    /// Reads a text file line by line and returns its content with every
    /// line (including the last) terminated by Environment.NewLine.
    /// </summary>
    /// <param name="path">File to read.</param>
    static string ReadAllText(string path)
    {
        StringBuilder sb = new StringBuilder();
        using (StreamReader sr = new StreamReader(path))
        {
            string line;
            // ReadLine strips the terminator, so re-append a uniform one;
            // CleanWordHtml's blank-line pattern is built from Environment.NewLine.
            while ((line = sr.ReadLine()) != null)
            {
                sb.Append(line).Append(Environment.NewLine);
            }
        }
        return sb.ToString();
    }

    /// <summary>
    /// Writes the text to the file as UTF-8 without a byte order mark,
    /// overwriting any existing file.
    /// </summary>
    static void WriteAllText(string path, string contents)
    {
        WriteAllText(path, contents, new UTF8Encoding(false, true));
    }

    /// <summary>
    /// Writes the text to the file using the given encoding, overwriting
    /// any existing file.
    /// </summary>
    static void WriteAllText(string path, string contents, Encoding encoding)
    {
        using (StreamWriter sw = new StreamWriter(path, false, encoding))
        {
            sw.Write(contents);
        }
    }
}