1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146
| Imports System.Net
Imports System.IO
Public Class Form1
Private Sub Button1_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles Button1.Click
'Le code suivant utilise une Form avec un contrôle TextBox nommé textBox1, un contrôle RichTextBox nommé richTextBox1 et un contrôle Button nommé button1.
Dim HttpWResponse As HttpWebResponse = Nothing
Dim sr As StreamReader = Nothing
Dim zz As String
Try
Dim HttpWRequest As HttpWebRequest = CType(WebRequest.Create(TextBox1.Text), HttpWebRequest)
HttpWResponse = CType(HttpWRequest.GetResponse, HttpWebResponse)
sr = New StreamReader(HttpWResponse.GetResponseStream)
zz = sr.ReadToEnd
RichTextBox1.Text = zz
Catch ex As Exception
MessageBox.Show(ex.Message)
Finally
If Not (HttpWResponse Is Nothing) Then
HttpWResponse.Close()
End If
If Not (sr Is Nothing) Then
sr.Close()
End If
End Try
Dim result As String
Dim source As String
result = " "
source = RichTextBox1.Text
htmltotxt(result, source)
RichTextBox1.Text = result
End Sub
Private Sub htmltotxt(ByRef result As String, ByRef source As String)
MsgBox("Ici HtmlToTxt")
'' Remove HTML Development formatting
'' Replace line breaks with space
'' because browsers inserts space
result = source.Replace("\r", " ")
'' Replace line breaks with space
'' because browsers inserts space
result = result.Replace("\n", " ")
'' Remove step-formatting
result = result.Replace("\t", String.Empty)
'' Remove repeating speces becuase browsers ignore them
result = System.Text.RegularExpressions.Regex.Replace(result, "( )+", " ")
'' Remove the header (prepare first by clearing attributes)
result = System.Text.RegularExpressions.Regex.Replace(result, "<( )*head([^>])*>", "<head>", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "(<( )*(/)( )*head( )*>)", "</head>", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "(<head>).*(</head>)", String.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase)
'' remove all scripts (prepare first by clearing attributes)
result = System.Text.RegularExpressions.Regex.Replace(result, "<( )*script([^>])*>", "<script>", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "(<( )*(/)( )*script( )*>)", "</script>", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "(<script>).*(</script>)", String.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase)
'' remove all styles (prepare first by clearing attributes)
result = System.Text.RegularExpressions.Regex.Replace(result, "<( )*style([^>])*>", "<style>", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "(<( )*(/)( )*style( )*>)", "</style>", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "(<style>).*(</style>)", String.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase)
'' insert tabs in spaces of <td> tags
result = System.Text.RegularExpressions.Regex.Replace(result, "<( )*td([^>])*>", "\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
'' insert line breaks in places of <BR> and <LI> tags
result = System.Text.RegularExpressions.Regex.Replace(result, "<( )*br( )*>", "\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "<( )*li( )*>", "\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
'' insert line paragraphs (double line breaks) in place
'' if <P>, <DIV> and <TR> tags
result = System.Text.RegularExpressions.Regex.Replace(result, "<( )*div([^>])*>", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "<( )*tr([^>])*>", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "<( )*p([^>])*>", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
'' Remove remaining tags like <a>, links, images,
'' comments etc - anything thats enclosed inside < >
result = System.Text.RegularExpressions.Regex.Replace(result, "<[^>]*>", String.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase)
''replace special characters:
result = System.Text.RegularExpressions.Regex.Replace(result, " ", " ", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "•", " * ", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "‹", "<", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "›", ">", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "™", "(tm)", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "⁄", "/", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "<", "<", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, ">", ">", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "©", "(c)", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "®", "(r)", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
'' Remove all others. More can be added, see
'' http://hotwired.lycos.com/webmonkey/reference/special_characters/
result = System.Text.RegularExpressions.Regex.Replace(result, "&(.{2,6});", String.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase)
'' for testng
''System.Text.RegularExpressions.Regex.Replace(result, this.txtRegex.Text,string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase)
'' make line breaking consistent
result = result.Replace("\n", "\r")
'' Remove extra line breaks and tabs:
'' replace over 2 breaks with 2 and over 4 tabs with 4.
'' Prepare first to remove any whitespaces inbetween
'' the escaped characters and remove redundant tabs inbetween linebreaks
result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)( )+(\r)", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "(\t)( )+(\t)", "\t\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "(\t)( )+(\r)", "\t\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)( )+(\t)", "\r\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
'' Remove redundant tabs
result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)(\t)+(\r)", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
'' Remove multible tabs followind a linebreak with just one tab
result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)(\t)+", "\r\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
'' Initial replacement target string for linebreaks
Dim breaks As String = "\r\r\r"
'' Initial replacement target string for tabs
Dim tabs As String = "\t\t\t\t\t"
Dim index As Integer
For index = 0 To index < result.Length
result = result.Replace(breaks, "\r\r")
result = result.Replace(tabs, "\t\t\t\t")
breaks = breaks + "\r"
tabs = tabs + "\t"
Next
End Sub
Private Sub TextBox1_TextChanged(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles TextBox1.TextChanged
End Sub
Private Sub TextBox2_TextChanged(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles TextBox2.TextChanged
End Sub
Private Sub RichTextBox1_TextChanged(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles RichTextBox1.TextChanged
End Sub
Private Sub LinkLabel1_LinkClicked(ByVal sender As System.Object, ByVal e As System.Windows.Forms.LinkLabelLinkClickedEventArgs) Handles LinkLabel1.LinkClicked
End Sub
Private Sub PictureBox1_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles PictureBox1.Click
End Sub
Private Sub Form1_Load(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles MyBase.Load
End Sub
End Class |