I found the article “C# Validate XHTML”, which includes source code, and decided to use it as a starting point for my HTML fragment validation. Unfortunately, quite a few things in the original code didn’t work as I expected or wanted, so I had to spend much more time changing it than I originally thought. Thanks to Sam Allen for his very responsive answers.
#region Summary
///////////////////////////////////////////////////////////////////////////////
/*/ $History: * Sam Allen http://www.dotnetperls.com/xhtml Created * Michael Freidgeim http://geekswithblogs.net/mnf/ 31-May-2011 Enhanced /*/
///////////////////////////////////////////////////////////////////////////////
#endregion
#region Namespace Imports
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Reflection;
using System.Text;
#endregion
/// <summary>
/// Lightweight structural checker for HTML/XHTML fragments. It scans the markup
/// character by character, keeps the currently open elements on a stack and
/// reports unbalanced or mismatched tags as a plain-text error string.
/// It is a heuristic scanner, not a full HTML parser.
/// Copied and modified from http://www.dotnetperls.com/xhtml
/// </summary>
public class HtmlValidator
{
    #region Enums

    /// <summary>
    /// Individual checks the validator can perform; combine with bitwise OR.
    /// </summary>
    [Flags]
    protected enum FeaturesToValidate
    {
        // "Quoted" attributes: flag attributes that are not double-quoted.
        // Declared for callers; the scanner in this class does not test this flag.
        QuotedAttributes = 1,
        // Invalid characters: an ampersand (&) is only allowed as part of an entity.
        // The scanner reports '&' followed by whitespace or by another '&'.
        AmpersandInCode = 2,
        // Lowercase markup: every tag name must be lowercase.
        LowercaseMarkup = 4,
        // Solo tags: in XHTML some tags (such as img and br) must be closed in the
        // same tag in which they are declared, e.g. <br />.
        SoloTags = 8,
        // Nested tags: child elements must be closed before their parents.
        // The scanner does not test this flag separately; nesting problems are
        // reported through the CloseTags check.
        NestedTags = 16,
        // Close tags: report closing tags without a matching open tag, elements
        // left open at the end of input and unbalanced parentheses.
        CloseTags = 32,
        // Tab: report every tab character.
        Tab = 64,
    }

    /// <summary>
    /// Shape of a tag as detected by <c>LookAheadForTagName</c>.
    /// </summary>
    [Flags]
    private enum TagType
    {
        Unknown = 0,
        CloseTag = 1,
        SoloTag = 2
    }

    #endregion //Enums

    #region Fields

    /// <summary>Name returned for a tag that runs to the end of input without a '&gt;'.</summary>
    private const string UnterminatedTagName = "x";

    private const string CommentStart = "<!--";
    private const string CommentEnd = "-->";

    /// <summary>
    /// Tags that never get a separate closing tag.
    /// The boolean tells whether the closing slash is required when
    /// <c>FeaturesToValidate.SoloTags</c> is checked.
    /// Keys are matched case-insensitively so that e.g. &lt;!doctype html&gt; is recognised.
    /// </summary>
    private static readonly Dictionary<string, bool> s_defaultSoloTags =
        new Dictionary<string, bool>(StringComparer.OrdinalIgnoreCase)
        {
            // HTML void elements: they have no content and no end tag.
            {"area", true},
            {"base", true},
            {"br", true},
            {"col", true},
            {"embed", true},
            {"hr", true},
            {"img", true},
            {"input", true},
            {"link", true},
            {"meta", true},
            {"param", true},
            {"source", true},
            {"track", true},
            {"wbr", true},
            {"!DOCTYPE", false},//todo: use as special TagType the DOCTYPE. Like at the start at every document just check it exists
            //and then start scanning after it. I don't think it's worth it trying to treat it as a solo tag.
            {"!--", false}//fallback for a comment that has no terminator; terminated comments are skipped by CheckHtml
        };

    /// <summary>Per-instance copy of the solo-tag table.</summary>
    private readonly Dictionary<string, bool> _soloTags;

    #endregion //Fields

    #region Constructors

    /// <summary>
    /// Creates a validator that uses the default solo-tag table.
    /// </summary>
    public HtmlValidator()
    {
        // The copy constructor does not inherit the source comparer, so pass it explicitly.
        _soloTags = new Dictionary<string, bool>(s_defaultSoloTags, StringComparer.OrdinalIgnoreCase);
    }

    #endregion //Constructors

    #region Static helpers

    /// <summary>
    /// Builds the content of a batch file that runs HTML Tidy
    /// (http://www.w3.org/People/Raggett/tidy/) on every matching file in a folder,
    /// appending Tidy's error output to errs.txt.
    /// </summary>
    /// <param name="path">Folder to search (top level only).</param>
    /// <param name="pattern">File search pattern.</param>
    /// <param name="tidyPath">Folder that contains the tidy executable; empty to rely on the PATH.</param>
    /// <param name="tidyOptions">possible options are listed at http://manpages.ubuntu.com/manpages/dapper/man1/tidy.1.html
    /// Default is
    /// -modify or -m modify the original input files
    /// -indent or -i indent element content
    /// -bare or -b strip out smart quotes and em dashes, etc.
    /// -quiet or -q suppress nonessential output
    /// -wrap &lt;column&gt; wrap text at the specified &lt;column&gt; (default is 68)
    ///</param>
    /// <param name="fileToSave">When not empty, the batch content (prefixed with a REM header line) is saved to this file.</param>
    /// <returns>The generated batch commands, one line per file, without the REM header.</returns>
    public static string GenerateTidyBatch(string path, string pattern = "*.htm?", string tidyPath = "", string tidyOptions = "-m -i -b -q -wrap 150", string fileToSave = "")
    {
        string[] files = Directory.GetFiles(path, pattern);
        Debug.WriteLine("The number of files in path {0} {1} is {2}.", path, pattern, files.Length);

        // Normalise the tool folder once instead of once per file.
        if (!String.IsNullOrEmpty(tidyPath))
        {
            tidyPath = StringHelper.EnsureEndsWith(tidyPath, @"\");
        }

        StringBuilder batch = new StringBuilder();
        foreach (string fileName in files)
        {
            batch.Append('"').Append(tidyPath).Append("tidy\" ")
                 .Append(tidyOptions)
                 .Append(" \"").Append(fileName).Append("\" 2>>errs.txt")
                 .Append(Environment.NewLine);
        }

        string batchContent = batch.ToString();
        if (!String.IsNullOrEmpty(fileToSave))
        {
            string fileContent = "REM the file has been generated by " + MethodBase.GetCurrentMethod().DeclaringType.FullName + Environment.NewLine + batchContent;
            StreamHelper.SaveStringToFile(fileContent, fileToSave);
        }
        return batchContent;
    }

    #endregion //Static helpers

    /// <summary>
    /// Runs <see cref="CheckHtmlStructure"/> on every matching file in a folder.
    /// </summary>
    /// <param name="path">Folder to search (top level only).</param>
    /// <param name="pattern">File search pattern.</param>
    /// <returns>
    /// One section per file that has errors (file name followed by its errors);
    /// an empty string when no file has errors.
    /// </returns>
    public string ValidateHtmlFiles(string path, string pattern = "*.htm?")
    {
        string[] files = Directory.GetFiles(path, pattern);
        Debug.WriteLine("The number of files in path {0} {1} is {2}.", path, pattern, files.Length);

        StringBuilder report = new StringBuilder();
        foreach (string fileName in files)
        {
            string fileErrors = CheckHtmlStructure(StreamHelper.FileToString(fileName));
            if (!string.IsNullOrEmpty(fileErrors))
            {
                report.Append(Environment.NewLine)
                      .Append("file ").Append(fileName).Append(" has errors ")
                      .Append(Environment.NewLine)
                      .Append(fileErrors);
            }
        }
        return report.ToString();
    }

    /// <summary>
    /// Checks that the tags in <paramref name="sHtml"/> are closed and nested correctly.
    /// </summary>
    /// <param name="sHtml">HTML text to check.</param>
    /// <returns>Space-separated error descriptions; an empty string when nothing was found.</returns>
    public string CheckHtmlStructure(string sHtml)
    {
        return CheckHtml(sHtml, FeaturesToValidate.CloseTags | FeaturesToValidate.NestedTags);
    }

    /// <summary>
    /// Scans the HTML and collects every problem selected by <paramref name="flagsToValidate"/>.
    /// Scanning continues after an error, so several errors may be reported.
    /// </summary>
    /// <param name="html">HTML text to check; null or empty text yields no errors.</param>
    /// <param name="flagsToValidate">Checks to perform.</param>
    /// <returns>Space-separated error descriptions; an empty string when nothing was found.</returns>
    private string CheckHtml(string html, FeaturesToValidate flagsToValidate)
    {
        if (string.IsNullOrEmpty(html))
        {
            return string.Empty;
        }

        bool checkLowercase = flagsToValidate.HasFlag(FeaturesToValidate.LowercaseMarkup);
        bool checkSoloTags = flagsToValidate.HasFlag(FeaturesToValidate.SoloTags);
        bool checkCloseTags = flagsToValidate.HasFlag(FeaturesToValidate.CloseTags);
        bool checkAmpersand = flagsToValidate.HasFlag(FeaturesToValidate.AmpersandInCode);
        bool checkTab = flagsToValidate.HasFlag(FeaturesToValidate.Tab);

        // Currently open elements, innermost on top.
        Stack<string> tags = new Stack<string>();
        StringBuilder error = new StringBuilder();
        int parenthesisL = 0;
        int parenthesisR = 0;

        for (int i = 0; i < html.Length; i++)
        {
            char c = html[i];
            if (c == '<')
            {
                // A terminated comment is skipped as a whole: its content is not markup.
                int commentEnd = FindCommentEnd(html, i);
                if (commentEnd >= 0)
                {
                    i = commentEnd;
                    continue;
                }

                TagType tagType;
                string tag = LookAheadForTagName(html, i, out tagType);
                bool isClose = tagType.HasFlag(TagType.CloseTag);
                bool isSolo = tagType.HasFlag(TagType.SoloTag);

                if (checkLowercase && tag.ToLowerInvariant() != tag)
                {
                    error.Append("upper: ").Append(tag).Append(' ');
                }

                bool slashRequired;
                bool isKnownSolo = _soloTags.TryGetValue(tag, out slashRequired);

                if (isKnownSolo && slashRequired)
                {
                    // Solo tag that must carry its own closing slash.
                    if (!isSolo && checkSoloTags)
                    {
                        error.Append("!solo: ").Append(tag).Append(' ');
                    }
                }
                else if (isClose)
                {
                    if (tags.Count == 0)
                    {
                        // We can't close a tag that isn't on the stack.
                        if (checkCloseTags)
                        {
                            error.Append("!closing: ").Append(tag).Append(' ');
                        }
                    }
                    else if (tags.Peek() == tag)
                    {
                        // Matching start tag found: the element is complete.
                        tags.Pop();
                    }
                    else if (checkCloseTags)
                    {
                        // Mismatched closing tag; the stack is left untouched.
                        error.Append("!match: ").Append(tag).Append(' ');
                    }
                }
                else if (!isKnownSolo && !isSolo)
                {
                    // Regular start tag. Self-closed tags (<x/>) and tags such as
                    // !DOCTYPE never get a closing tag, so they are not pushed.
                    tags.Push(tag);
                }

                // Skip the tag name; the rest of the tag is scanned as ordinary text.
                i += tag.Length;
            }
            else if (c == '&')
            {
                // & must never be followed by space or other &
                if (checkAmpersand && (i + 1) < html.Length)
                {
                    char next = html[i + 1];
                    if (char.IsWhiteSpace(next) || next == '&')
                    {
                        error.Append("ampersand ");
                    }
                }
            }
            else if (c == '\t')
            {
                if (checkTab)
                {
                    error.Append("tab ");
                }
            }
            else if (c == '(')
            {
                parenthesisL++;
            }
            else if (c == ')')
            {
                parenthesisR++;
            }
        }

        if (checkCloseTags)
        {
            // Everything still on the stack was opened but never closed.
            foreach (string tagName in tags)
            {
                // Defensive: a DOCTYPE is allowed to be not closed.
                if (!string.Equals(tagName, "!DOCTYPE", StringComparison.OrdinalIgnoreCase))
                {
                    error.Append("extra:").Append(tagName).Append(' ');
                }
            }

            // Require even number of parenthesis
            if (parenthesisL != parenthesisR)
            {
                error.Append("!even parenthesisList");
            }
        }
        return error.ToString();
    }

    /// <summary>
    /// Finds the end of the comment that starts at <paramref name="start"/>.
    /// </summary>
    /// <returns>
    /// Index of the final '&gt;' of the comment, or -1 when the text at
    /// <paramref name="start"/> is not a comment or the comment is not terminated.
    /// </returns>
    private static int FindCommentEnd(string html, int start)
    {
        if (string.CompareOrdinal(html, start, CommentStart, 0, CommentStart.Length) != 0)
        {
            return -1;
        }
        int terminator = html.IndexOf(CommentEnd, start + CommentStart.Length, StringComparison.Ordinal);
        if (terminator < 0)
        {
            return -1;
        }
        return terminator + CommentEnd.Length - 1;
    }

    /// <summary>
    /// Called at the '&lt;' of a tag. Scans forward to the closing '&gt;', collects the
    /// tag name and works out from the position of the last unquoted slash whether
    /// this is a start tag, a close tag (&lt;/x&gt;) or a solo tag (&lt;x/&gt;).
    /// </summary>
    /// <param name="html">Text being scanned.</param>
    /// <param name="start">Index of the '&lt;' that opens the tag.</param>
    /// <param name="tagType">Receives the detected tag shape; Unknown for a plain start tag.</param>
    /// <returns>
    /// Tag name (characters up to the first whitespace, without '&lt;', '&gt;' and '/'),
    /// or "x" when the input ends before a '&gt;' is found.
    /// </returns>
    private static string LookAheadForTagName(string html, int start, out TagType tagType)
    {
        tagType = TagType.Unknown;
        StringBuilder tagName = new StringBuilder();

        // Offset (relative to start) of the last slash outside a quoted value.
        int slashPos = -1;
        // Once whitespace is seen the tag name is complete.
        bool nameEnded = false;
        // Quote character of the attribute value we are inside, or '\0' when outside.
        char openQuote = '\0';

        int i;
        for (i = 0; ; i++)
        {
            int pos = start + i;
            if (pos >= html.Length)
            {
                // Ran off the end without a closing '>'.
                return UnterminatedTagName;
            }

            char c = html[pos];
            if (char.IsWhiteSpace(c))
            {
                nameEnded = true;
            }

            if (!nameEnded && c != '<' && c != '>' && c != '/')
            {
                tagName.Append(c);
            }

            if (c == '/' && openQuote == '\0')
            {
                slashPos = i;
            }

            if (c == '>')
            {
                break;
            }

            // Track both quote styles so that slashes inside '...' or "..." values
            // (typically URLs) are not mistaken for the tag's own slash.
            if (openQuote == '\0')
            {
                if (c == '"' || c == '\'')
                {
                    openQuote = c;
                }
            }
            else if (c == openQuote)
            {
                openQuote = '\0';
            }
        }

        if (slashPos != -1)
        {
            if (slashPos + 1 == i)
            {
                // Slash right before '>': <x/>
                tagType |= TagType.SoloTag;
            }
            else if (slashPos == 1)
            {
                // Slash right after '<': </x>
                tagType |= TagType.CloseTag;
            }
            // Otherwise the slash sits inside the tag (e.g. an unquoted URL):
            // ordinary input, so the tag is treated as a plain start tag.
        }

        return tagName.ToString();
    }
}