Search Tools Links Login

HTML2Text


Visual Basic 6, or VB Classic

This code converts a string containing HTML tags and character entities into a text-only string. It collapses runs of multiple spaces and supports ALL encoded characters like &quot;, &nbsp;, &Auml; and so on.

Original Author: unknown

Inputs

The original HTML String OrigHTML$

Returns

Text-only string

Code

' Converts an HTML string into plain text.
'
' Processing steps:
'   1. If the input contains a <body ...> tag, only the text between that tag
'      and the following </body> (when present) is converted.
'   2. Runs of spaces are collapsed to a single space.
'   3. CR / LF characters in the source are dropped, together with any spaces
'      that directly follow them.
'   4. <p> and </div> become a blank line (two CRLFs), <br> becomes one CRLF;
'      tag attributes and a self-closing slash are ignored. Every other tag
'      is removed. A "<" without a matching ">" is dropped.
'   5. Character references (&name; / &#NNN; / &#xHH;) are decoded. Unknown
'      references are left in the output unchanged.
'
' Limitation: the text inside elements such as <script> or <style> is not
' treated specially; only the tags themselves are removed.
'
' Parameters:
'   OrigHTML$ - the HTML source (passed ByVal, the caller's string is untouched).
' Returns:
'   The text-only rendition of OrigHTML$.
Public Function HTML2Text(ByVal OrigHTML$) As String
    Dim Piece As String      ' text contributed by the token being processed
    Dim Result As String     ' accumulated output
    Dim Decoded As String    ' decoded character reference ("" when unknown)
    Dim BodyPos As Long
    Dim ClosePos As Long
    Dim SemiPos As Long
    Dim SpacePos As Long

    ' Restrict the input to the contents of <body ...> ... </body>.
    BodyPos = InStr(LCase$(OrigHTML$), "<body")
    If BodyPos > 0 Then
        OrigHTML$ = Mid$(OrigHTML$, BodyPos)
        ' Skip the remainder of the opening tag, attributes included.
        OrigHTML$ = Mid$(OrigHTML$, InStr(OrigHTML$, ">") + 1)
        ClosePos = InStr(LCase$(OrigHTML$), "</body>")
        If ClosePos > 0 Then OrigHTML$ = Left$(OrigHTML$, ClosePos - 1)
    End If

    ' Consume the input from the front, one token at a time.
    Do While Len(OrigHTML$) > 0
        Piece = Left$(OrigHTML$, 1)
        OrigHTML$ = Mid$(OrigHTML$, 2)

        Select Case Piece
        Case " "
            ' Keep this space, discard the ones that follow it.
            OrigHTML$ = LTrim$(OrigHTML$)

        Case vbCr, vbLf
            ' Source line breaks carry no meaning in HTML; drop them.
            Piece = ""
            If Left$(OrigHTML$, 1) = vbLf Then OrigHTML$ = Mid$(OrigHTML$, 2)
            OrigHTML$ = LTrim$(OrigHTML$)

        Case "<"
            Piece = ""
            ClosePos = InStr(OrigHTML$, ">")
            If ClosePos > 0 Then
                Select Case Html2TextTagName(Left$(OrigHTML$, ClosePos - 1))
                Case "p", "/div"
                    Piece = vbCrLf & vbCrLf
                Case "br"
                    Piece = vbCrLf
                End Select
                OrigHTML$ = Mid$(OrigHTML$, ClosePos + 1)
            End If

        Case "&"
            SemiPos = InStr(OrigHTML$, ";")
            SpacePos = InStr(OrigHTML$, " ")
            ' A reference must end with ";" before the next space. When there
            ' is no space left at all (SpacePos = 0) any ";" qualifies.
            If SemiPos > 1 And (SpacePos = 0 Or SemiPos < SpacePos) Then
                Decoded = Html2TextEntity(Left$(OrigHTML$, SemiPos - 1))
                If Len(Decoded) > 0 Then
                    Piece = Decoded
                    OrigHTML$ = Mid$(OrigHTML$, SemiPos + 1)
                End If
                ' Unknown reference: emit just the "&" and let the following
                ' characters be scanned normally, so the text is preserved
                ' and any tags in it are still stripped.
            End If
        End Select

        Result = Result & Piece
    Loop

    HTML2Text = Result
End Function

' Returns the lower-cased element name of a tag body (the text between "<"
' and ">"), without attributes or a trailing self-closing slash.
' A leading "/" of a closing tag is kept, e.g. "/DIV class=x" -> "/div".
Private Function Html2TextTagName(ByVal TagBody As String) As String
    Dim i As Long
    Dim Ch As String

    TagBody = LCase$(Trim$(TagBody))
    For i = 1 To Len(TagBody)
        Ch = Mid$(TagBody, i, 1)
        If Ch = " " Or Ch = vbTab Or Ch = vbCr Or Ch = vbLf Then Exit For
        ' "/" ends the name unless it is the closing-tag marker at position 1.
        If Ch = "/" And i > 1 Then Exit For
    Next i
    ' i is the position of the first character after the name
    ' (Len + 1 when the loop ran to completion).
    Html2TextTagName = Left$(TagBody, i - 1)
End Function

' Decodes the name of a character reference (the text between "&" and ";").
' Handles the markup entities, the Latin-1 named entities and numeric
' references in decimal ("#233") or hexadecimal ("#xE9") form.
' Returns "" when the reference is not recognised.
Private Function Html2TextEntity(ByVal EntityName As String) As String
    ' Latin-1 entity names in code-point order; the first one is U+00A1 (161).
    Const ENTITY_NAMES As String = _
        "iexcl cent pound curren yen brvbar sect uml copy ordf laquo not shy reg macr " & _
        "deg plusmn sup2 sup3 acute micro para middot cedil sup1 ordm raquo frac14 frac12 frac34 iquest " & _
        "Agrave Aacute Acirc Atilde Auml Aring AElig Ccedil Egrave Eacute Ecirc Euml Igrave Iacute Icirc Iuml " & _
        "ETH Ntilde Ograve Oacute Ocirc Otilde Ouml times Oslash Ugrave Uacute Ucirc Uuml Yacute THORN szlig " & _
        "agrave aacute acirc atilde auml aring aelig ccedil egrave eacute ecirc euml igrave iacute icirc iuml " & _
        "eth ntilde ograve oacute ocirc otilde ouml divide oslash ugrave uacute ucirc uuml yacute thorn yuml"
    Const FIRST_CODE As Long = 161

    Static Names() As String
    Static NamesLoaded As Boolean

    Dim i As Long
    Dim Digits As String
    Dim Radix As Long
    Dim DigitVal As Long
    Dim Code As Long

    Html2TextEntity = ""
    If Len(EntityName) = 0 Then Exit Function

    ' Numeric character reference.
    If Left$(EntityName, 1) = "#" Then
        Digits = LCase$(Mid$(EntityName, 2))
        Radix = 10
        If Left$(Digits, 1) = "x" Then
            Radix = 16
            Digits = Mid$(Digits, 2)
        End If
        ' At most 7 digits so that Code cannot overflow a Long.
        If Len(Digits) = 0 Or Len(Digits) > 7 Then Exit Function
        For i = 1 To Len(Digits)
            DigitVal = InStr("0123456789abcdef", Mid$(Digits, i, 1)) - 1
            If DigitVal < 0 Or DigitVal >= Radix Then Exit Function
            Code = Code * Radix + DigitVal
        Next i
        If Code = 160 Then
            ' Same result as the named form &nbsp;.
            Html2TextEntity = " "
        ElseIf Code >= 1 And Code <= 65535 _
               And Not (Code >= 55296 And Code <= 57343) Then
            ' Single UTF-16 code unit, surrogate range excluded.
            Html2TextEntity = ChrW$(Code)
        End If
        Exit Function
    End If

    ' Entities with an ASCII replacement, plus the legacy alias "plusm".
    Select Case EntityName
    Case "amp"
        Html2TextEntity = "&"
    Case "quot"
        Html2TextEntity = """"
    Case "lt"
        Html2TextEntity = "<"
    Case "gt"
        Html2TextEntity = ">"
    Case "nbsp"
        Html2TextEntity = " "
    Case "brvbar"
        Html2TextEntity = "|"
    Case "plusm"
        Html2TextEntity = ChrW$(177)
    End Select
    If Len(Html2TextEntity) > 0 Then Exit Function

    ' Latin-1 named entities. The comparison is forced to binary so that
    ' pairs such as "Auml" / "auml" stay distinct under Option Compare Text.
    If Not NamesLoaded Then
        Names = Split(ENTITY_NAMES, " ")
        NamesLoaded = True
    End If
    For i = LBound(Names) To UBound(Names)
        If StrComp(Names(i), EntityName, vbBinaryCompare) = 0 Then
            Html2TextEntity = ChrW$(FIRST_CODE + i - LBound(Names))
            Exit Function
        End If
    Next i
End Function

About this post

Posted: 2002-06-01
By: ArchiveBot
Viewed: 115 times

Categories

Visual Basic 6

Attachments

No attachments for this post


Loading Comments ...

Comments

No comments have been added for this post.

You must be logged in to make a comment.