// Html Strip Tags
using System;
using System.IO;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
namespace HtmlStripTags
{
// Converts HTML markup to plain text: removes tags, comments and script/style/noscript
// content, and optionally decodes named and numeric character references.
class HtmlHelper
{
    // HTML 4.01 named character references, keyed by the complete reference text
    // (leading '&' and trailing ';' included). Lookups are ordinal, so names are
    // case-sensitive ("&Prime;" and "&prime;" are different entries).
    // Space-like entities map to a plain space; invisible formatting characters
    // (soft hyphen, joiners, direction marks) map to the empty string.
    private static readonly Dictionary<string, string> htmlNamedEntities =
        new Dictionary<string, string>(StringComparer.Ordinal)
    {
        // Markup-significant characters.
        { "&quot;", "\"" }, { "&lt;", "<" }, { "&gt;", ">" }, { "&amp;", "&" },

        // Latin-1 symbols.
        { "&nbsp;", " " }, { "&iexcl;", "¡" }, { "&cent;", "¢" }, { "&pound;", "£" },
        { "&curren;", "¤" }, { "&yen;", "¥" }, { "&brvbar;", "¦" }, { "&sect;", "§" },
        { "&uml;", "¨" }, { "&copy;", "©" }, { "&ordf;", "ª" }, { "&laquo;", "«" },
        { "&not;", "¬" }, { "&shy;", "" }, { "&reg;", "®" }, { "&macr;", "¯" },
        { "&deg;", "°" }, { "&plusmn;", "±" }, { "&sup2;", "²" }, { "&sup3;", "³" },
        { "&acute;", "´" }, { "&micro;", "\u00B5" }, { "&para;", "¶" }, { "&middot;", "·" },
        { "&cedil;", "¸" }, { "&sup1;", "¹" }, { "&ordm;", "º" }, { "&raquo;", "»" },
        { "&frac14;", "¼" }, { "&frac12;", "½" }, { "&frac34;", "¾" }, { "&iquest;", "¿" },

        // Latin-1 upper-case letters.
        { "&Agrave;", "À" }, { "&Aacute;", "Á" }, { "&Acirc;", "Â" }, { "&Atilde;", "Ã" },
        { "&Auml;", "Ä" }, { "&Aring;", "Å" }, { "&AElig;", "Æ" }, { "&Ccedil;", "Ç" },
        { "&Egrave;", "È" }, { "&Eacute;", "É" }, { "&Ecirc;", "Ê" }, { "&Euml;", "Ë" },
        { "&Igrave;", "Ì" }, { "&Iacute;", "Í" }, { "&Icirc;", "Î" }, { "&Iuml;", "Ï" },
        { "&ETH;", "Ð" }, { "&Ntilde;", "Ñ" }, { "&Ograve;", "Ò" }, { "&Oacute;", "Ó" },
        { "&Ocirc;", "Ô" }, { "&Otilde;", "Õ" }, { "&Ouml;", "Ö" }, { "&times;", "×" },
        { "&Oslash;", "Ø" }, { "&Ugrave;", "Ù" }, { "&Uacute;", "Ú" }, { "&Ucirc;", "Û" },
        { "&Uuml;", "Ü" }, { "&Yacute;", "Ý" }, { "&THORN;", "Þ" }, { "&szlig;", "ß" },

        // Latin-1 lower-case letters.
        { "&agrave;", "à" }, { "&aacute;", "á" }, { "&acirc;", "â" }, { "&atilde;", "ã" },
        { "&auml;", "ä" }, { "&aring;", "å" }, { "&aelig;", "æ" }, { "&ccedil;", "ç" },
        { "&egrave;", "è" }, { "&eacute;", "é" }, { "&ecirc;", "ê" }, { "&euml;", "ë" },
        { "&igrave;", "ì" }, { "&iacute;", "í" }, { "&icirc;", "î" }, { "&iuml;", "ï" },
        { "&eth;", "ð" }, { "&ntilde;", "ñ" }, { "&ograve;", "ò" }, { "&oacute;", "ó" },
        { "&ocirc;", "ô" }, { "&otilde;", "õ" }, { "&ouml;", "ö" }, { "&divide;", "÷" },
        { "&oslash;", "ø" }, { "&ugrave;", "ù" }, { "&uacute;", "ú" }, { "&ucirc;", "û" },
        { "&uuml;", "ü" }, { "&yacute;", "ý" }, { "&thorn;", "þ" }, { "&yuml;", "ÿ" },

        // Latin Extended and spacing modifier letters.
        { "&OElig;", "Œ" }, { "&oelig;", "œ" }, { "&Scaron;", "Š" }, { "&scaron;", "š" },
        { "&Yuml;", "Ÿ" }, { "&fnof;", "ƒ" }, { "&circ;", "ˆ" }, { "&tilde;", "˜" },

        // Greek upper-case letters.
        { "&Alpha;", "Α" }, { "&Beta;", "Β" }, { "&Gamma;", "Γ" }, { "&Delta;", "Δ" },
        { "&Epsilon;", "Ε" }, { "&Zeta;", "Ζ" }, { "&Eta;", "Η" }, { "&Theta;", "Θ" },
        { "&Iota;", "Ι" }, { "&Kappa;", "Κ" }, { "&Lambda;", "Λ" }, { "&Mu;", "Μ" },
        { "&Nu;", "Ν" }, { "&Xi;", "Ξ" }, { "&Omicron;", "Ο" }, { "&Pi;", "Π" },
        { "&Rho;", "Ρ" }, { "&Sigma;", "Σ" }, { "&Tau;", "Τ" }, { "&Upsilon;", "Υ" },
        { "&Phi;", "Φ" }, { "&Chi;", "Χ" }, { "&Psi;", "Ψ" }, { "&Omega;", "Ω" },

        // Greek lower-case letters and symbol variants.
        { "&alpha;", "α" }, { "&beta;", "β" }, { "&gamma;", "γ" }, { "&delta;", "δ" },
        { "&epsilon;", "ε" }, { "&zeta;", "ζ" }, { "&eta;", "η" }, { "&theta;", "θ" },
        { "&iota;", "ι" }, { "&kappa;", "κ" }, { "&lambda;", "λ" }, { "&mu;", "μ" },
        { "&nu;", "ν" }, { "&xi;", "ξ" }, { "&omicron;", "ο" }, { "&pi;", "π" },
        { "&rho;", "ρ" }, { "&sigmaf;", "ς" }, { "&sigma;", "σ" }, { "&tau;", "τ" },
        { "&upsilon;", "υ" }, { "&phi;", "φ" }, { "&chi;", "χ" }, { "&psi;", "ψ" },
        { "&omega;", "ω" }, { "&thetasym;", "ϑ" }, { "&upsih;", "ϒ" }, { "&piv;", "ϖ" },

        // General punctuation and invisible formatting characters.
        { "&ensp;", " " }, { "&emsp;", " " }, { "&thinsp;", " " },
        { "&zwnj;", "" }, { "&zwj;", "" }, { "&lrm;", "" }, { "&rlm;", "" },
        { "&ndash;", "–" }, { "&mdash;", "—" }, { "&lsquo;", "‘" }, { "&rsquo;", "’" },
        { "&sbquo;", "‚" }, { "&ldquo;", "“" }, { "&rdquo;", "”" }, { "&bdquo;", "„" },
        { "&dagger;", "†" }, { "&Dagger;", "‡" }, { "&bull;", "•" }, { "&hellip;", "…" },
        { "&permil;", "‰" }, { "&prime;", "′" }, { "&Prime;", "″" }, { "&lsaquo;", "‹" },
        { "&rsaquo;", "›" }, { "&oline;", "‾" }, { "&frasl;", "⁄" }, { "&euro;", "€" },

        // Letter-like symbols and arrows.
        { "&image;", "ℑ" }, { "&weierp;", "℘" }, { "&real;", "ℜ" }, { "&trade;", "™" },
        { "&alefsym;", "ℵ" }, { "&larr;", "←" }, { "&uarr;", "↑" }, { "&rarr;", "→" },
        { "&darr;", "↓" }, { "&harr;", "↔" }, { "&crarr;", "↵" }, { "&lArr;", "⇐" },
        { "&uArr;", "⇑" }, { "&rArr;", "⇒" }, { "&dArr;", "⇓" }, { "&hArr;", "⇔" },

        // Mathematical operators.
        { "&forall;", "∀" }, { "&part;", "∂" }, { "&exist;", "∃" }, { "&empty;", "∅" },
        { "&nabla;", "∇" }, { "&isin;", "∈" }, { "&notin;", "∉" }, { "&ni;", "∋" },
        { "&prod;", "∏" }, { "&sum;", "∑" }, { "&minus;", "−" }, { "&lowast;", "∗" },
        { "&radic;", "√" }, { "&prop;", "∝" }, { "&infin;", "∞" }, { "&ang;", "∠" },
        { "&and;", "∧" }, { "&or;", "∨" }, { "&cap;", "∩" }, { "&cup;", "∪" },
        { "&int;", "∫" }, { "&there4;", "∴" }, { "&sim;", "∼" }, { "&cong;", "≅" },
        { "&asymp;", "≈" }, { "&ne;", "≠" }, { "&equiv;", "≡" }, { "&le;", "≤" },
        { "&ge;", "≥" }, { "&sub;", "⊂" }, { "&sup;", "⊃" }, { "&nsub;", "⊄" },
        { "&sube;", "⊆" }, { "&supe;", "⊇" }, { "&oplus;", "⊕" }, { "&otimes;", "⊗" },
        { "&perp;", "⊥" }, { "&sdot;", "⋅" },

        // Technical and geometric symbols, card suits.
        // lang/rang use the HTML 4.01 code points U+2329 / U+232A.
        { "&lceil;", "⌈" }, { "&rceil;", "⌉" }, { "&lfloor;", "⌊" }, { "&rfloor;", "⌋" },
        { "&lang;", "\u2329" }, { "&rang;", "\u232A" }, { "&loz;", "◊" },
        { "&spades;", "♠" }, { "&clubs;", "♣" }, { "&hearts;", "♥" }, { "&diams;", "♦" }
    };

    // Matches one character reference: decimal (&#233;), hexadecimal (&#xE9;) or named (&eacute;).
    private static readonly Regex entityPattern = new Regex(
        "&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);",
        RegexOptions.CultureInvariant | RegexOptions.Compiled);

    // Elements whose entire content is discarded, written in lower case.
    private static readonly string[] rawTextElements = { "noscript", "script", "style" };

    /// <summary>
    /// Strips markup from an HTML document or fragment and returns the remaining text.
    /// </summary>
    /// <remarks>
    /// When an opening and/or closing body tag is present, only the span between the first
    /// occurrence of each is scanned. Comments and the content of noscript, script and style
    /// elements are dropped. Every other tag is replaced by a single space so that text from
    /// adjacent elements does not run together. The result is not trimmed or
    /// whitespace-normalized. Character references are decoded in a single pass, so the output
    /// of one replacement is never decoded a second time.
    /// </remarks>
    /// <param name="htmlContent">The HTML to process; may be null.</param>
    /// <param name="replaceNamedEntities">
    /// True to decode the HTML 4.01 named character references listed in this class.
    /// </param>
    /// <param name="replaceNumberedEntities">
    /// True to decode decimal and hexadecimal numeric character references.
    /// </param>
    /// <returns>
    /// Null when <paramref name="htmlContent"/> is null, an empty string when it is blank,
    /// otherwise the extracted text.
    /// </returns>
    public static string HtmlStripTags(string htmlContent, bool replaceNamedEntities, bool replaceNumberedEntities)
    {
        if (htmlContent == null)
            return null;
        htmlContent = htmlContent.Trim();
        if (htmlContent.Length == 0)
            return string.Empty;

        // Tag names are ASCII, so the lookup is ordinal rather than culture-sensitive.
        int bodyStartTagIdx = htmlContent.IndexOf("<body", StringComparison.OrdinalIgnoreCase);
        int bodyEndTagIdx = htmlContent.IndexOf("</body>", StringComparison.OrdinalIgnoreCase);
        int startIdx = bodyStartTagIdx >= 0 ? bodyStartTagIdx : 0;
        int endIdx = bodyEndTagIdx >= 0 ? bodyEndTagIdx : htmlContent.Length - 1;

        bool insideTag = false;
        bool insideAttributeValue = false;
        bool insideHtmlComment = false;
        string openRawElement = null; // name of the noscript/script/style element being skipped
        char attributeValueDelimiter = '"';
        StringBuilder sb = new StringBuilder(htmlContent.Length);

        for (int i = startIdx; i <= endIdx; i++)
        {
            // Comments take precedence over everything else, including script content.
            if (insideHtmlComment)
            {
                if (StartsWithAt(htmlContent, i, "-->"))
                {
                    i += 2;
                    insideHtmlComment = false;
                }
                continue;
            }
            if (StartsWithAt(htmlContent, i, "<!--"))
            {
                i += 3;
                insideHtmlComment = true;
                continue;
            }

            // Inside a skipped element only its own closing tag is significant.
            if (openRawElement != null)
            {
                if (IsTagAt(htmlContent, i, openRawElement, true))
                {
                    // Move to the character after the tag name, then on to the closing '>'.
                    i += openRawElement.Length + 2;
                    while (i < htmlContent.Length && htmlContent[i] != '>')
                        i++;
                    openRawElement = null;
                }
                continue;
            }

            string openedElement = FindRawElementAt(htmlContent, i);
            if (openedElement != null)
            {
                // The remainder of the opening tag is skipped together with the content.
                i += openedElement.Length;
                openRawElement = openedElement;
                continue;
            }

            char current = htmlContent[i];
            if (!insideTag)
            {
                if (current == '<')
                {
                    insideTag = true;
                    continue;
                }
                sb.Append(current);
                continue;
            }

            if (insideAttributeValue)
            {
                // A '>' inside a quoted attribute value must not end the tag.
                if (current == attributeValueDelimiter)
                    insideAttributeValue = false;
            }
            else if (current == '"' || current == '\'')
            {
                attributeValueDelimiter = current;
                insideAttributeValue = true;
            }
            else if (current == '>')
            {
                insideTag = false;
                sb.Append(' '); // prevent words from different tags (<td>s for example) from joining together
            }
        }

        string text = sb.ToString();
        if (replaceNamedEntities || replaceNumberedEntities)
            text = entityPattern.Replace(
                text,
                match => DecodeEntity(match.Value, replaceNamedEntities, replaceNumberedEntities));
        return text;
    }

    // Returns true when 'token' occurs in 'html' starting exactly at 'index' (ordinal comparison).
    private static bool StartsWithAt(string html, int index, string token)
    {
        return index + token.Length <= html.Length &&
               string.CompareOrdinal(html, index, token, 0, token.Length) == 0;
    }

    // Returns the name of the skipped element whose opening tag starts at 'index', or null.
    private static string FindRawElementAt(string html, int index)
    {
        if (html[index] != '<')
            return null;
        foreach (string name in rawTextElements)
            if (IsTagAt(html, index, name, false))
                return name;
        return null;
    }

    // Returns true when an opening (or closing) tag named 'tagName' starts at 'index'.
    // The name is matched ASCII case-insensitively and must be followed by whitespace or '>',
    // so "<scripts>" is not mistaken for "<script>". 'tagName' must be lower case.
    private static bool IsTagAt(string html, int index, string tagName, bool closing)
    {
        if (html[index] != '<')
            return false;
        int nameStart = index + (closing ? 2 : 1);
        int terminatorIdx = nameStart + tagName.Length;
        if (terminatorIdx >= html.Length)
            return false;
        if (closing && html[index + 1] != '/')
            return false;
        for (int k = 0; k < tagName.Length; k++)
        {
            char c = html[nameStart + k];
            if (c >= 'A' && c <= 'Z')
                c = (char)(c + ('a' - 'A'));
            if (c != tagName[k])
                return false;
        }
        char terminator = html[terminatorIdx];
        return terminator == '>' || char.IsWhiteSpace(terminator);
    }

    // Decodes a single character reference matched by entityPattern.
    // References that are unknown, disabled by the flags, or not valid Unicode scalar
    // values are returned unchanged.
    private static string DecodeEntity(string entity, bool decodeNamed, bool decodeNumbered)
    {
        if (entity[1] != '#')
        {
            string replacement;
            if (decodeNamed && htmlNamedEntities.TryGetValue(entity, out replacement))
                return replacement;
            return entity;
        }

        if (!decodeNumbered)
            return entity;

        bool isHex = entity[2] == 'x' || entity[2] == 'X';
        int digitsStart = isHex ? 3 : 2;
        string digits = entity.Substring(digitsStart, entity.Length - digitsStart - 1);
        System.Globalization.NumberStyles style = isHex
            ? System.Globalization.NumberStyles.AllowHexSpecifier
            : System.Globalization.NumberStyles.None;

        int codePoint;
        if (!int.TryParse(digits, style, System.Globalization.CultureInfo.InvariantCulture, out codePoint))
            return entity; // too many digits to be a code point

        // Hex values with the top bit set parse as negative numbers; surrogates are not characters.
        if (codePoint < 0 || codePoint > 0x10FFFF || (codePoint >= 0xD800 && codePoint <= 0xDFFF))
            return entity;

        return char.ConvertFromUtf32(codePoint);
    }
}
}
using System.IO;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
namespace HtmlStripTags
{
// Converts HTML markup to plain text: removes tags, comments and script/style/noscript
// content, and optionally decodes named and numeric character references.
class HtmlHelper
{
    // HTML 4.01 named character references, keyed by the complete reference text
    // (leading '&' and trailing ';' included). Lookups are ordinal, so names are
    // case-sensitive ("&Prime;" and "&prime;" are different entries).
    // Space-like entities map to a plain space; invisible formatting characters
    // (soft hyphen, joiners, direction marks) map to the empty string.
    private static readonly Dictionary<string, string> htmlNamedEntities =
        new Dictionary<string, string>(StringComparer.Ordinal)
    {
        // Markup-significant characters.
        { "&quot;", "\"" }, { "&lt;", "<" }, { "&gt;", ">" }, { "&amp;", "&" },

        // Latin-1 symbols.
        { "&nbsp;", " " }, { "&iexcl;", "¡" }, { "&cent;", "¢" }, { "&pound;", "£" },
        { "&curren;", "¤" }, { "&yen;", "¥" }, { "&brvbar;", "¦" }, { "&sect;", "§" },
        { "&uml;", "¨" }, { "&copy;", "©" }, { "&ordf;", "ª" }, { "&laquo;", "«" },
        { "&not;", "¬" }, { "&shy;", "" }, { "&reg;", "®" }, { "&macr;", "¯" },
        { "&deg;", "°" }, { "&plusmn;", "±" }, { "&sup2;", "²" }, { "&sup3;", "³" },
        { "&acute;", "´" }, { "&micro;", "\u00B5" }, { "&para;", "¶" }, { "&middot;", "·" },
        { "&cedil;", "¸" }, { "&sup1;", "¹" }, { "&ordm;", "º" }, { "&raquo;", "»" },
        { "&frac14;", "¼" }, { "&frac12;", "½" }, { "&frac34;", "¾" }, { "&iquest;", "¿" },

        // Latin-1 upper-case letters.
        { "&Agrave;", "À" }, { "&Aacute;", "Á" }, { "&Acirc;", "Â" }, { "&Atilde;", "Ã" },
        { "&Auml;", "Ä" }, { "&Aring;", "Å" }, { "&AElig;", "Æ" }, { "&Ccedil;", "Ç" },
        { "&Egrave;", "È" }, { "&Eacute;", "É" }, { "&Ecirc;", "Ê" }, { "&Euml;", "Ë" },
        { "&Igrave;", "Ì" }, { "&Iacute;", "Í" }, { "&Icirc;", "Î" }, { "&Iuml;", "Ï" },
        { "&ETH;", "Ð" }, { "&Ntilde;", "Ñ" }, { "&Ograve;", "Ò" }, { "&Oacute;", "Ó" },
        { "&Ocirc;", "Ô" }, { "&Otilde;", "Õ" }, { "&Ouml;", "Ö" }, { "&times;", "×" },
        { "&Oslash;", "Ø" }, { "&Ugrave;", "Ù" }, { "&Uacute;", "Ú" }, { "&Ucirc;", "Û" },
        { "&Uuml;", "Ü" }, { "&Yacute;", "Ý" }, { "&THORN;", "Þ" }, { "&szlig;", "ß" },

        // Latin-1 lower-case letters.
        { "&agrave;", "à" }, { "&aacute;", "á" }, { "&acirc;", "â" }, { "&atilde;", "ã" },
        { "&auml;", "ä" }, { "&aring;", "å" }, { "&aelig;", "æ" }, { "&ccedil;", "ç" },
        { "&egrave;", "è" }, { "&eacute;", "é" }, { "&ecirc;", "ê" }, { "&euml;", "ë" },
        { "&igrave;", "ì" }, { "&iacute;", "í" }, { "&icirc;", "î" }, { "&iuml;", "ï" },
        { "&eth;", "ð" }, { "&ntilde;", "ñ" }, { "&ograve;", "ò" }, { "&oacute;", "ó" },
        { "&ocirc;", "ô" }, { "&otilde;", "õ" }, { "&ouml;", "ö" }, { "&divide;", "÷" },
        { "&oslash;", "ø" }, { "&ugrave;", "ù" }, { "&uacute;", "ú" }, { "&ucirc;", "û" },
        { "&uuml;", "ü" }, { "&yacute;", "ý" }, { "&thorn;", "þ" }, { "&yuml;", "ÿ" },

        // Latin Extended and spacing modifier letters.
        { "&OElig;", "Œ" }, { "&oelig;", "œ" }, { "&Scaron;", "Š" }, { "&scaron;", "š" },
        { "&Yuml;", "Ÿ" }, { "&fnof;", "ƒ" }, { "&circ;", "ˆ" }, { "&tilde;", "˜" },

        // Greek upper-case letters.
        { "&Alpha;", "Α" }, { "&Beta;", "Β" }, { "&Gamma;", "Γ" }, { "&Delta;", "Δ" },
        { "&Epsilon;", "Ε" }, { "&Zeta;", "Ζ" }, { "&Eta;", "Η" }, { "&Theta;", "Θ" },
        { "&Iota;", "Ι" }, { "&Kappa;", "Κ" }, { "&Lambda;", "Λ" }, { "&Mu;", "Μ" },
        { "&Nu;", "Ν" }, { "&Xi;", "Ξ" }, { "&Omicron;", "Ο" }, { "&Pi;", "Π" },
        { "&Rho;", "Ρ" }, { "&Sigma;", "Σ" }, { "&Tau;", "Τ" }, { "&Upsilon;", "Υ" },
        { "&Phi;", "Φ" }, { "&Chi;", "Χ" }, { "&Psi;", "Ψ" }, { "&Omega;", "Ω" },

        // Greek lower-case letters and symbol variants.
        { "&alpha;", "α" }, { "&beta;", "β" }, { "&gamma;", "γ" }, { "&delta;", "δ" },
        { "&epsilon;", "ε" }, { "&zeta;", "ζ" }, { "&eta;", "η" }, { "&theta;", "θ" },
        { "&iota;", "ι" }, { "&kappa;", "κ" }, { "&lambda;", "λ" }, { "&mu;", "μ" },
        { "&nu;", "ν" }, { "&xi;", "ξ" }, { "&omicron;", "ο" }, { "&pi;", "π" },
        { "&rho;", "ρ" }, { "&sigmaf;", "ς" }, { "&sigma;", "σ" }, { "&tau;", "τ" },
        { "&upsilon;", "υ" }, { "&phi;", "φ" }, { "&chi;", "χ" }, { "&psi;", "ψ" },
        { "&omega;", "ω" }, { "&thetasym;", "ϑ" }, { "&upsih;", "ϒ" }, { "&piv;", "ϖ" },

        // General punctuation and invisible formatting characters.
        { "&ensp;", " " }, { "&emsp;", " " }, { "&thinsp;", " " },
        { "&zwnj;", "" }, { "&zwj;", "" }, { "&lrm;", "" }, { "&rlm;", "" },
        { "&ndash;", "–" }, { "&mdash;", "—" }, { "&lsquo;", "‘" }, { "&rsquo;", "’" },
        { "&sbquo;", "‚" }, { "&ldquo;", "“" }, { "&rdquo;", "”" }, { "&bdquo;", "„" },
        { "&dagger;", "†" }, { "&Dagger;", "‡" }, { "&bull;", "•" }, { "&hellip;", "…" },
        { "&permil;", "‰" }, { "&prime;", "′" }, { "&Prime;", "″" }, { "&lsaquo;", "‹" },
        { "&rsaquo;", "›" }, { "&oline;", "‾" }, { "&frasl;", "⁄" }, { "&euro;", "€" },

        // Letter-like symbols and arrows.
        { "&image;", "ℑ" }, { "&weierp;", "℘" }, { "&real;", "ℜ" }, { "&trade;", "™" },
        { "&alefsym;", "ℵ" }, { "&larr;", "←" }, { "&uarr;", "↑" }, { "&rarr;", "→" },
        { "&darr;", "↓" }, { "&harr;", "↔" }, { "&crarr;", "↵" }, { "&lArr;", "⇐" },
        { "&uArr;", "⇑" }, { "&rArr;", "⇒" }, { "&dArr;", "⇓" }, { "&hArr;", "⇔" },

        // Mathematical operators.
        { "&forall;", "∀" }, { "&part;", "∂" }, { "&exist;", "∃" }, { "&empty;", "∅" },
        { "&nabla;", "∇" }, { "&isin;", "∈" }, { "&notin;", "∉" }, { "&ni;", "∋" },
        { "&prod;", "∏" }, { "&sum;", "∑" }, { "&minus;", "−" }, { "&lowast;", "∗" },
        { "&radic;", "√" }, { "&prop;", "∝" }, { "&infin;", "∞" }, { "&ang;", "∠" },
        { "&and;", "∧" }, { "&or;", "∨" }, { "&cap;", "∩" }, { "&cup;", "∪" },
        { "&int;", "∫" }, { "&there4;", "∴" }, { "&sim;", "∼" }, { "&cong;", "≅" },
        { "&asymp;", "≈" }, { "&ne;", "≠" }, { "&equiv;", "≡" }, { "&le;", "≤" },
        { "&ge;", "≥" }, { "&sub;", "⊂" }, { "&sup;", "⊃" }, { "&nsub;", "⊄" },
        { "&sube;", "⊆" }, { "&supe;", "⊇" }, { "&oplus;", "⊕" }, { "&otimes;", "⊗" },
        { "&perp;", "⊥" }, { "&sdot;", "⋅" },

        // Technical and geometric symbols, card suits.
        // lang/rang use the HTML 4.01 code points U+2329 / U+232A.
        { "&lceil;", "⌈" }, { "&rceil;", "⌉" }, { "&lfloor;", "⌊" }, { "&rfloor;", "⌋" },
        { "&lang;", "\u2329" }, { "&rang;", "\u232A" }, { "&loz;", "◊" },
        { "&spades;", "♠" }, { "&clubs;", "♣" }, { "&hearts;", "♥" }, { "&diams;", "♦" }
    };

    // Matches one character reference: decimal (&#233;), hexadecimal (&#xE9;) or named (&eacute;).
    private static readonly Regex entityPattern = new Regex(
        "&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);",
        RegexOptions.CultureInvariant | RegexOptions.Compiled);

    // Elements whose entire content is discarded, written in lower case.
    private static readonly string[] rawTextElements = { "noscript", "script", "style" };

    /// <summary>
    /// Strips markup from an HTML document or fragment and returns the remaining text.
    /// </summary>
    /// <remarks>
    /// When an opening and/or closing body tag is present, only the span between the first
    /// occurrence of each is scanned. Comments and the content of noscript, script and style
    /// elements are dropped. Every other tag is replaced by a single space so that text from
    /// adjacent elements does not run together. The result is not trimmed or
    /// whitespace-normalized. Character references are decoded in a single pass, so the output
    /// of one replacement is never decoded a second time.
    /// </remarks>
    /// <param name="htmlContent">The HTML to process; may be null.</param>
    /// <param name="replaceNamedEntities">
    /// True to decode the HTML 4.01 named character references listed in this class.
    /// </param>
    /// <param name="replaceNumberedEntities">
    /// True to decode decimal and hexadecimal numeric character references.
    /// </param>
    /// <returns>
    /// Null when <paramref name="htmlContent"/> is null, an empty string when it is blank,
    /// otherwise the extracted text.
    /// </returns>
    public static string HtmlStripTags(string htmlContent, bool replaceNamedEntities, bool replaceNumberedEntities)
    {
        if (htmlContent == null)
            return null;
        htmlContent = htmlContent.Trim();
        if (htmlContent.Length == 0)
            return string.Empty;

        // Tag names are ASCII, so the lookup is ordinal rather than culture-sensitive.
        int bodyStartTagIdx = htmlContent.IndexOf("<body", StringComparison.OrdinalIgnoreCase);
        int bodyEndTagIdx = htmlContent.IndexOf("</body>", StringComparison.OrdinalIgnoreCase);
        int startIdx = bodyStartTagIdx >= 0 ? bodyStartTagIdx : 0;
        int endIdx = bodyEndTagIdx >= 0 ? bodyEndTagIdx : htmlContent.Length - 1;

        bool insideTag = false;
        bool insideAttributeValue = false;
        bool insideHtmlComment = false;
        string openRawElement = null; // name of the noscript/script/style element being skipped
        char attributeValueDelimiter = '"';
        StringBuilder sb = new StringBuilder(htmlContent.Length);

        for (int i = startIdx; i <= endIdx; i++)
        {
            // Comments take precedence over everything else, including script content.
            if (insideHtmlComment)
            {
                if (StartsWithAt(htmlContent, i, "-->"))
                {
                    i += 2;
                    insideHtmlComment = false;
                }
                continue;
            }
            if (StartsWithAt(htmlContent, i, "<!--"))
            {
                i += 3;
                insideHtmlComment = true;
                continue;
            }

            // Inside a skipped element only its own closing tag is significant.
            if (openRawElement != null)
            {
                if (IsTagAt(htmlContent, i, openRawElement, true))
                {
                    // Move to the character after the tag name, then on to the closing '>'.
                    i += openRawElement.Length + 2;
                    while (i < htmlContent.Length && htmlContent[i] != '>')
                        i++;
                    openRawElement = null;
                }
                continue;
            }

            string openedElement = FindRawElementAt(htmlContent, i);
            if (openedElement != null)
            {
                // The remainder of the opening tag is skipped together with the content.
                i += openedElement.Length;
                openRawElement = openedElement;
                continue;
            }

            char current = htmlContent[i];
            if (!insideTag)
            {
                if (current == '<')
                {
                    insideTag = true;
                    continue;
                }
                sb.Append(current);
                continue;
            }

            if (insideAttributeValue)
            {
                // A '>' inside a quoted attribute value must not end the tag.
                if (current == attributeValueDelimiter)
                    insideAttributeValue = false;
            }
            else if (current == '"' || current == '\'')
            {
                attributeValueDelimiter = current;
                insideAttributeValue = true;
            }
            else if (current == '>')
            {
                insideTag = false;
                sb.Append(' '); // prevent words from different tags (<td>s for example) from joining together
            }
        }

        string text = sb.ToString();
        if (replaceNamedEntities || replaceNumberedEntities)
            text = entityPattern.Replace(
                text,
                match => DecodeEntity(match.Value, replaceNamedEntities, replaceNumberedEntities));
        return text;
    }

    // Returns true when 'token' occurs in 'html' starting exactly at 'index' (ordinal comparison).
    private static bool StartsWithAt(string html, int index, string token)
    {
        return index + token.Length <= html.Length &&
               string.CompareOrdinal(html, index, token, 0, token.Length) == 0;
    }

    // Returns the name of the skipped element whose opening tag starts at 'index', or null.
    private static string FindRawElementAt(string html, int index)
    {
        if (html[index] != '<')
            return null;
        foreach (string name in rawTextElements)
            if (IsTagAt(html, index, name, false))
                return name;
        return null;
    }

    // Returns true when an opening (or closing) tag named 'tagName' starts at 'index'.
    // The name is matched ASCII case-insensitively and must be followed by whitespace or '>',
    // so "<scripts>" is not mistaken for "<script>". 'tagName' must be lower case.
    private static bool IsTagAt(string html, int index, string tagName, bool closing)
    {
        if (html[index] != '<')
            return false;
        int nameStart = index + (closing ? 2 : 1);
        int terminatorIdx = nameStart + tagName.Length;
        if (terminatorIdx >= html.Length)
            return false;
        if (closing && html[index + 1] != '/')
            return false;
        for (int k = 0; k < tagName.Length; k++)
        {
            char c = html[nameStart + k];
            if (c >= 'A' && c <= 'Z')
                c = (char)(c + ('a' - 'A'));
            if (c != tagName[k])
                return false;
        }
        char terminator = html[terminatorIdx];
        return terminator == '>' || char.IsWhiteSpace(terminator);
    }

    // Decodes a single character reference matched by entityPattern.
    // References that are unknown, disabled by the flags, or not valid Unicode scalar
    // values are returned unchanged.
    private static string DecodeEntity(string entity, bool decodeNamed, bool decodeNumbered)
    {
        if (entity[1] != '#')
        {
            string replacement;
            if (decodeNamed && htmlNamedEntities.TryGetValue(entity, out replacement))
                return replacement;
            return entity;
        }

        if (!decodeNumbered)
            return entity;

        bool isHex = entity[2] == 'x' || entity[2] == 'X';
        int digitsStart = isHex ? 3 : 2;
        string digits = entity.Substring(digitsStart, entity.Length - digitsStart - 1);
        System.Globalization.NumberStyles style = isHex
            ? System.Globalization.NumberStyles.AllowHexSpecifier
            : System.Globalization.NumberStyles.None;

        int codePoint;
        if (!int.TryParse(digits, style, System.Globalization.CultureInfo.InvariantCulture, out codePoint))
            return entity; // too many digits to be a code point

        // Hex values with the top bit set parse as negative numbers; surrogates are not characters.
        if (codePoint < 0 || codePoint > 0x10FFFF || (codePoint >= 0xD800 && codePoint <= 0xDFFF))
            return entity;

        return char.ConvertFromUtf32(codePoint);
    }
}
}