Mega Code Archive

Sanitize any potentially dangerous tags from the provided raw HTML input using a whitelist-based approach

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.ComponentModel;

namespace NearForums
{
    public static class Utils
    {
        #region Regex definitions

        // These patterns do not depend on any argument, so they are built once per
        // process instead of on every call (two of them use RegexOptions.Compiled,
        // which is costly to construct). Regex instances are immutable and safe to
        // share between threads.

        /// <summary>
        /// Matches every tag-like span: a '&lt;' followed by anything up to the next '&gt;'
        /// or, for an unterminated tag, up to the end of the input.
        /// </summary>
        private static readonly Regex s_tagsRegex = new Regex("<[^>]*(>|$)", RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled);

        /// <summary>
        /// Matches attributes inside a tag whose name is not one of href, class, rel, title,
        /// width, height, alt or src, plus the class="MsoNormal" attribute on p elements.
        /// Everything it matches is stripped before the per-tag whitelist check.
        /// </summary>
        private static readonly Regex s_cleanupRegex = new Regex("((?<=<\\w+[^>]*)(?!\\shref|\\sclass|\\srel|\\stitle|\\sclass|\\swidth|\\sheight|\\salt|\\ssrc)(\\s[\\w-]+)=[\"']?((?:.(?![\"']?\\s+(?:\\S+)=|[>\"']))+.)[\"']?)|((?<=<p[^>]*)\\sclass=\"MsoNormal\")", RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.IgnoreCase | RegexOptions.Compiled);

        /// <summary>
        /// Accepts an opening anchor whose href is a fragment (#name) or an http/https/ftp URL,
        /// optionally followed by class, title and rel="nofollow" attributes, or a closing anchor.
        /// </summary>
        private static readonly Regex s_whitelistAnchorRegex = new Regex(@" ^<a\s href=""(\#\w+|(https?|ftp)://[-a-z0-9+&@#/%?=~_|!:,.;]+)"" ( (\sclass=""([\w-]+)"")|(\stitle=""[^""<>]+"")| (\srel=""nofollow""))* \s?>$| ^</a>$", RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.IgnorePatternWhitespace);

        /// <summary>
        /// Accepts an img tag whose src is an http/https URL, optionally followed by
        /// width, height (1 to 3 digits each), alt and title attributes.
        /// </summary>
        private static readonly Regex s_whitelistImageRegex = new Regex(@" ^<img\s src=""https?://[-a-z0-9+&@#/%?=~_|!:,.;]+"" ((\swidth=""\d{1,3}"")| (\sheight=""\d{1,3}"")| (\salt=""[^""<>]*"")| (\stitle=""[^""<>]*""))* \s?/?>$", RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.IgnorePatternWhitespace);

        #endregion

        /// <summary>
        /// sanitize any potentially dangerous tags from the provided raw HTML input using
        /// a whitelist based approach, leaving the "safe" HTML tags
        /// CODESNIPPET:4100A61A-1711-4366-B0B0-144D1179A937 / http://refactormycode.com/codes/333-sanitize-html
        /// </summary>
        /// <remarks>
        /// Only the tag markup itself is removed; any text between a removed opening tag and
        /// its closing tag stays in the output.
        /// </remarks>
        /// <param name="html">Html to sanitize</param>
        /// <param name="whiteListTags">Regex containing the allowed name of the html elements. For example: em|h(2|3|4)|strong|p</param>
        /// <returns>
        /// The input with every non-whitelisted tag removed. A null or empty input is returned unchanged.
        /// </returns>
        public static string SanitizeHtml(string html, string whiteListTags = "b(lockquote)?|code|d(d|t|l|el)|em|h(1|2|3|4)|i|kbd|li|ol|p(re)?|s(ub|up|trong|trike)?|ul|a|img")
        {
            // Nothing to sanitize: bail out before doing any regex work.
            if (String.IsNullOrEmpty(html))
            {
                return html;
            }

            // This pattern embeds the caller-supplied tag list, so it has to be built per call.
            // Besides the listed elements (attribute-less, opening or closing) it always accepts br and hr.
            Regex whitelistRegex = new Regex("^</?(" + whiteListTags + ")>$|^<(b|h)r\\s?/?>$", RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.IgnorePatternWhitespace);

            // Preliminary cleanup: strip attributes that are not allowed (e.g. markup pasted from Word).
            html = s_cleanupRegex.Replace(html, "");

            // Match every HTML tag in the cleaned input.
            MatchCollection tags = s_tagsRegex.Matches(html);

            // Walk the matches from last to first so that removing a tag does not
            // shift the indexes of the matches still to be processed.
            for (int i = tags.Count - 1; i > -1; i--)
            {
                Match tag = tags[i];

                // The whitelist patterns are written in lower case, so compare against a lower-cased copy.
                string tagname = tag.Value.ToLowerInvariant();

                bool isAllowed = whitelistRegex.IsMatch(tagname)
                    || s_whitelistAnchorRegex.IsMatch(tagname)
                    || s_whitelistImageRegex.IsMatch(tagname);

                if (!isAllowed)
                {
                    html = html.Remove(tag.Index, tag.Length);
                    System.Diagnostics.Debug.WriteLine("tag sanitized: " + tagname);
                }
            }

            return html;
        }
    }
}