How To Convert HTML to Text, Easily
Whether
you want to convert an HTML page into pure text so you can parse out
that special piece of information, or you simply want to load a page
from the Net into your own word processing package, this mini function
could come in handy.
It’s called StripTags and accepts an HTML string. Using a regular expression, it identifies all <tags>, removes them, and returns the modified string. Here’s the code — first the page markup (StripTag.aspx), then its code-behind (StripTag.aspx.cs):
<%@ Page Language="C#" ValidateRequest="False" AutoEventWireup="true" CodeFile="StripTag.aspx.cs" Inherits="StripTag" %>
<%--
    Demo page for the StripTags helper defined in the code-behind class StripTag.
    ValidateRequest is set to "False" in the directive above so that markup typed
    into TextBox1 can be posted back to the page.
--%>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head runat="server">
    <title>Untitled Page</title>
</head>
<body>
    <form id="form1" runat="server">
        <%-- Multi-line input box that receives the HTML to be stripped. --%>
        <div>
            <asp:TextBox ID="TextBox1" runat="server" TextMode="MultiLine" Height="172px" Width="363px"></asp:TextBox>
        </div>
        <%-- Clicking the button posts back and runs Button1_Click, which writes its result to Label1. --%>
        <asp:Button ID="Button1" runat="server" Text="Button" OnClick="Button1_Click" />
        <asp:Label ID="Label1" runat="server" Text="Label"></asp:Label>
    </form>
</body>
</html>
using System;
using System.Data;
using System.Configuration;
using System.Collections;
using System.Text.RegularExpressions;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;

/// <summary>
/// Code-behind for the StripTag page: takes the HTML typed into TextBox1,
/// removes everything that looks like a tag, and shows the result in Label1.
/// </summary>
public partial class StripTag : System.Web.UI.Page
{
    // Matches a '<', any run of characters other than '>', and the closing '>'.
    // Built once and shared, instead of re-supplying the pattern on every call.
    private static readonly Regex TagPattern = new Regex("<[^>]*>", RegexOptions.Compiled);

    /// <summary>
    /// Page load handler; intentionally empty.
    /// </summary>
    protected void Page_Load(object sender, EventArgs e)
    {
    }

    /// <summary>
    /// Removes every <c>&lt;...&gt;</c> sequence from the supplied HTML.
    /// </summary>
    /// <param name="HTML">The markup to strip. May be null or empty.</param>
    /// <returns>
    /// The input with all matched tags removed, or an empty string when
    /// <paramref name="HTML"/> is null or empty.
    /// </returns>
    /// <remarks>
    /// This is a purely textual match: a '&lt;' that is never followed by a
    /// '&gt;' is left in place, and character entities such as
    /// <c>&amp;amp;</c> are not decoded.
    /// </remarks>
    public string StripTags(string HTML)
    {
        // Regex.Replace throws on a null input; treat "nothing in" as "nothing out".
        if (string.IsNullOrEmpty(HTML))
        {
            return string.Empty;
        }

        return TagPattern.Replace(HTML, string.Empty);
    }

    /// <summary>
    /// Click handler for Button1: strips the tags from the text box content
    /// and displays the result in the label.
    /// </summary>
    protected void Button1_Click(object sender, EventArgs e)
    {
        string plainText = StripTags(TextBox1.Text);

        // The label writes its Text into the page as-is, and StripTags leaves an
        // unterminated tag (a '<' with no closing '>') untouched, so the result
        // is HTML-encoded before display to keep user input from being rendered
        // as markup.
        Label1.Text = HttpUtility.HtmlEncode(plainText);
    }
}
0 comments: