Friday, August 14, 2015

How To Convert HTML To Formatted Plain Text

Simple removal of HTML tags with the Regex.Replace method

Sometimes you want to remove the tags from HTML and keep only the plain text. In general, this is a simple task, but there are a few drawbacks in some scenarios. The simplest solution is to remove all tags from the given HTML without preserving any formatting.

using System.Text.RegularExpressions;
Then...

// The code below is for regular .NET sites
/// <summary>
/// Converts an HTML fragment to plain text by deleting every tag.
/// </summary>
/// <param name="source">HTML to strip; may contain the escaped brackets &amp;lt; and &amp;gt;.</param>
/// <returns>
/// The text with tags, &amp;nbsp; entities and whitespace-only lines removed,
/// or an empty string when <paramref name="source"/> is null.
/// </returns>
private string StripHtml(string source)
{
    // Guard: calling Replace on a null reference would throw NullReferenceException.
    if (source == null)
    {
        return string.Empty;
    }

    // Replace &lt; with the < symbol and &gt; with the > symbol, so that
    // escaped markup is stripped by the tag pattern below as well.
    string output = source.Replace("&lt;", "<").Replace("&gt;", ">");

    // Get rid of HTML tags.
    output = Regex.Replace(output, "<[^>]*>", string.Empty);

    // Drop non-breaking-space entities; a literal match needs no regex.
    output = output.Replace("&nbsp;", string.Empty);

    // Get rid of whitespace-only lines together with their line break.
    return Regex.Replace(output, @"^\s*$\n", string.Empty, RegexOptions.Multiline);
}

// The code below is for SharePoint sites
/// <summary>
/// Converts an HTML fragment to plain text using SharePoint's
/// SPHttpUtility.ConvertSimpleHtmlToText helper.
/// </summary>
/// <param name="str">HTML to convert; may contain the escaped brackets &amp;lt; and &amp;gt;.</param>
/// <returns>
/// The converted text with &amp;nbsp; entities and whitespace-only lines removed,
/// or an empty string when <paramref name="str"/> is null.
/// </returns>
private string StripHtml(string str)
{
    // Guard: without it a null argument fails inside the first replace call.
    if (str == null)
    {
        return string.Empty;
    }

    // Turn escaped angle brackets back into real ones before conversion;
    // these are literal matches, so no regex is needed.
    str = str.Replace("&lt;", "<").Replace("&gt;", ">");

    // NOTE(review): -1 is presumably "no maximum length" - confirm against the SharePoint docs.
    str = SPHttpUtility.ConvertSimpleHtmlToText(str, -1);

    // Drop any non-breaking-space entities that remain after conversion.
    str = str.Replace("&nbsp;", string.Empty);

    // Get rid of whitespace-only lines together with their line break.
    return Regex.Replace(str, @"^\s*$\n", string.Empty, RegexOptions.Multiline);
}