Mega Code Archive

 
Categories / C# / Network
 

Sanitize any potentially dangerous tags from the provided raw HTML input using a whitelist-based approach.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.ComponentModel;

namespace NearForums
{
    /// <summary>
    /// Static helper methods.
    /// </summary>
    public static class Utils
    {
        /// <summary>
        /// Default alternation of allowed element names used by <see cref="SanitizeHtml"/>.
        /// </summary>
        private const string DefaultWhiteListTags = "b(lockquote)?|code|d(d|t|l|el)|em|h(1|2|3|4)|i|kbd|li|ol|p(re)?|s(ub|up|trong|trike)?|ul|a|img";

        // The patterns below never change, so they are built once and shared by every call
        // instead of being re-parsed (and re-compiled) on each invocation.

        /// <summary>Matches every tag-like span: a '&lt;' up to the next '&gt;' or the end of the input.</summary>
        private static readonly Regex TagsRegex = new Regex("<[^>]*(>|$)",
            RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled);

        /// <summary>
        /// Matches attributes whose name is not one of href, class, rel, title, width, height, alt or src,
        /// plus the class="MsoNormal" attribute on p elements, so they can be stripped before tag validation.
        /// </summary>
        private static readonly Regex CleanupRegex = new Regex("((?<=<\\w+[^>]*)(?!\\shref|\\sclass|\\srel|\\stitle|\\sclass|\\swidth|\\sheight|\\salt|\\ssrc)(\\s[\\w-]+)=[\"']?((?:.(?![\"']?\\s+(?:\\S+)=|[>\"']))+.)[\"']?)|((?<=<p[^>]*)\\sclass=\"MsoNormal\")",
            RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.IgnoreCase | RegexOptions.Compiled);

        /// <summary>Matches an opening anchor with an http(s)/ftp or fragment href and optional class/title/rel="nofollow", or a closing anchor.</summary>
        private static readonly Regex WhitelistAnchorRegex = new Regex(@"
            ^<a\s
            href=""(\#\w+|(https?|ftp)://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+)""
            (
            (\sclass=""([\w-]+)"")|(\stitle=""[^""<>]+"")|
            (\srel=""nofollow""))*
            \s?>$|
            ^</a>$",
            RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);

        /// <summary>Matches an img tag with an http(s) src and optional width/height/alt/title attributes.</summary>
        private static readonly Regex WhitelistImageRegex = new Regex(@"
            ^<img\s
            src=""https?://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+""
            ((\swidth=""\d{1,3}"")|
            (\sheight=""\d{1,3}"")|
            (\salt=""[^""<>]*"")|
            (\stitle=""[^""<>]*""))*
            \s?/?>$",
            RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled);

        /// <summary>Whitelist regex for <see cref="DefaultWhiteListTags"/>, cached because it is the common case.</summary>
        private static readonly Regex DefaultWhitelistRegex = BuildWhitelistRegex(DefaultWhiteListTags);

        /// <summary>
        /// Builds the regex that accepts attribute-less opening/closing tags named in
        /// <paramref name="whiteListTags"/>, as well as br and hr tags.
        /// </summary>
        private static Regex BuildWhitelistRegex(string whiteListTags)
        {
            return new Regex("^</?(" + whiteListTags + ")>$|^<(b|h)r\\s?/?>$",
                RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.IgnorePatternWhitespace);
        }

        /// <summary>
        /// Sanitize any potentially dangerous tags from the provided raw HTML input using
        /// a whitelist based approach, leaving the "safe" HTML tags.
        /// CODESNIPPET:4100A61A-1711-4366-B0B0-144D1179A937 / http://refactormycode.com/codes/333-sanitize-html
        /// </summary>
        /// <param name="html">Html to sanitize</param>
        /// <param name="whiteListTags">Regex containing the allowed name of the html elements. For example: em|h(2|3|4)|strong|p</param>
        /// <returns>
        /// The input with disallowed attributes and every non-whitelisted tag removed (text between tags is kept).
        /// A null or empty input is returned unchanged.
        /// </returns>
        public static string SanitizeHtml(string html, string whiteListTags = DefaultWhiteListTags)
        {
            // Nothing to sanitize: bail out before doing any regex work.
            if (String.IsNullOrEmpty(html))
            {
                return html;
            }

            // Only a caller-supplied whitelist requires building a new regex.
            Regex whitelistRegex = String.Equals(whiteListTags, DefaultWhiteListTags, StringComparison.Ordinal)
                ? DefaultWhitelistRegex
                : BuildWhitelistRegex(whiteListTags);

            // First pass: strip attributes that are not allowed (including markup pasted from Word).
            html = CleanupRegex.Replace(html, "");

            // Second pass: walk every tag once, front to back, copying only the text that survives.
            // The builder is created lazily so input without rejected tags is returned as-is.
            StringBuilder sanitized = null;
            int copiedUpTo = 0;
            foreach (Match tag in TagsRegex.Matches(html))
            {
                // Validation is done on a lower-cased copy; kept tags retain their original casing.
                string tagname = tag.Value.ToLowerInvariant();
                bool allowed = whitelistRegex.IsMatch(tagname)
                    || WhitelistAnchorRegex.IsMatch(tagname)
                    || WhitelistImageRegex.IsMatch(tagname);
                if (allowed)
                {
                    continue;
                }

                if (sanitized == null)
                {
                    sanitized = new StringBuilder(html.Length);
                }
                sanitized.Append(html, copiedUpTo, tag.Index - copiedUpTo);
                copiedUpTo = tag.Index + tag.Length;
                System.Diagnostics.Debug.WriteLine("tag sanitized: " + tagname);
            }

            if (sanitized == null)
            {
                return html;
            }

            sanitized.Append(html, copiedUpTo, html.Length - copiedUpTo);
            return sanitized.ToString();
        }
    }
}