Search Tools Links Login

Huffman Compression - for MS Access


Uses Fredrik Qvarfort's Huffman Encoding/Decoding Class in an MS Access database as a module.

Original Author: Chad M. Kovac

Assumptions

Encoding Time:
Inputfile: 3,114,776
Outputfile: 1,781,030 (57%)
Time: 16.92 seconds
Decoding (same file)
Time: 15.60 seconds
Scond File (small text file) Encoding:
Inputfile: 18548
Outputfile: 11608
Time: .121 seconds
Decoded file: .119 seconds
Using an MS Access .MDE database:
File 1:
Encoded: 15.362 Seconds
Decoded: 13.719 Seconds
File 2:
Encoded: .11 Seconds
Decoded: .009 Seconds
So performance is improved a bit using the .MDE file rather than the .MDB file. I have not attempted to create a .DLL or Class object and link to it or reference to it from MS Access yet. I would think a .DLL would be even faster - however, still not attaining the speeds of a pure VB compiled class.
The test machine is a PIII 700 with 512 Megs of Ram running Windows 2000 SP2.

Code

'Huffman Encoding/Decoding Class
'-------------------------------
'
'(c) 2000, Fredrik Qvarfort
'
'Modified for MS Access by Chad M. Kovac
'http://www.TechKnowledgeyInc.com
'03/13/2002
Option Explicit
'Progress Values for the encoding routine
Private Const PROGRESS_CALCFREQUENCY = 7
Private Const PROGRESS_CALCCRC = 5
Private Const PROGRESS_ENCODING = 88
'Progress Values for the decoding routine
Private Const PROGRESS_DECODING = 89
Private Const PROGRESS_CHECKCRC = 11
'Events
'Event Progress(Procent As Integer)
Private Type HUFFMANTREE
ParentNode As Integer
RightNode As Integer
LeftNode As Integer
Value As Integer
Weight As Long
End Type
Private Type ByteArray
Count As Byte
Data() As Byte
End Type
Private Declare Sub CopyMem Lib "kernel32" Alias "RtlMoveMemory" (Destination As Any, Source As Any, ByVal Length As Long)
Public Sub EncodeFile(SourceFile As String, DestFile As String)
Dim ByteArray() As Byte
Dim Filenr As Integer

'Make sure the source file exists
If (Not FileExist(SourceFile)) Then
Err.Raise vbObjectError, "clsHuffman.EncodeFile()", "Source file does not exist"
End If

'Read the data from the sourcefile
Filenr = FreeFile
Open SourceFile For Binary As #Filenr
ReDim ByteArray(0 To LOF(Filenr) - 1)
Get #Filenr, , ByteArray()
Close #Filenr

'Compress the data
Call EncodeByte(ByteArray(), UBound(ByteArray) + 1)

'If the destination file exist we need to
'destroy it because opening it as binary
'will not clear the old data
If (FileExist(DestFile)) Then Kill DestFile

'Save the destination string
Open DestFile For Binary As #Filenr
Put #Filenr, , ByteArray()
Close #Filenr
End Sub
Public Sub DecodeFile(SourceFile As String, DestFile As String)
Dim ByteArray() As Byte
Dim Filenr As Integer

'Make sure the source file exists
If (Not FileExist(SourceFile)) Then
Err.Raise vbObjectError, "clsHuffman.DecodeFile()", "Source file does not exist"
End If

'Read the data from the sourcefile
Filenr = FreeFile
Open SourceFile For Binary As #Filenr
ReDim ByteArray(0 To LOF(Filenr) - 1)
Get #Filenr, , ByteArray()
Close #Filenr

'Uncompress the data
Call DecodeByte(ByteArray(), UBound(ByteArray) + 1)

'If the destination file exist we need to
'destroy it because opening it as binary
'will not clear the old data
If (FileExist(DestFile)) Then Kill DestFile

'Save the destination string
Open DestFile For Binary As #Filenr
Put #Filenr, , ByteArray()
Close #Filenr
End Sub
Private Sub CreateTree(Nodes() As HUFFMANTREE, NodesCount As Long, Char As Long, Bytes As ByteArray)
Dim a As Integer
Dim NodeIndex As Long

NodeIndex = 0
For a = 0 To (Bytes.Count - 1)
If (Bytes.Data(a) = 0) Then
  'Left node
  If (Nodes(NodeIndex).LeftNode = -1) Then
  Nodes(NodeIndex).LeftNode = NodesCount
  Nodes(NodesCount).ParentNode = NodeIndex
  Nodes(NodesCount).LeftNode = -1
  Nodes(NodesCount).RightNode = -1
  Nodes(NodesCount).Value = -1
  NodesCount = NodesCount + 1
  End If
  NodeIndex = Nodes(NodeIndex).LeftNode
ElseIf (Bytes.Data(a) = 1) Then
  'Right node
  If (Nodes(NodeIndex).RightNode = -1) Then
  Nodes(NodeIndex).RightNode = NodesCount
  Nodes(NodesCount).ParentNode = NodeIndex
  Nodes(NodesCount).LeftNode = -1
  Nodes(NodesCount).RightNode = -1
  Nodes(NodesCount).Value = -1
  NodesCount = NodesCount + 1
  End If
  NodeIndex = Nodes(NodeIndex).RightNode
Else
  Stop
End If
Next

Nodes(NodeIndex).Value = Char
End Sub
Public Sub EncodeByte(ByteArray() As Byte, ByteLen As Long)
Application.SysCmd acSysCmdInitMeter, "Encoding...", 100
Dim i As Long
Dim j As Long
Dim Char As Byte
Dim BitPos As Byte
Dim lNode1 As Long
Dim lNode2 As Long
Dim lNodes As Long
Dim lLength As Long
Dim Count As Integer
Dim lWeight1 As Long
Dim lWeight2 As Long
Dim Result() As Byte
Dim ByteValue As Byte
Dim ResultLen As Long
Dim Bytes As ByteArray
Dim NodesCount As Integer
Dim NewProgress As Integer
Dim CurrProgress As Integer
Dim BitValue(0 To 7) As Byte
Dim CharCount(0 To 255) As Long
Dim Nodes(0 To 511) As HUFFMANTREE
Dim CharValue(0 To 255) As ByteArray

'If the source string is empty or contains
'only one character we return it uncompressed
'with the prefix string "HEO" & vbCr
If (ByteLen = 0) Then
ReDim Preserve ByteArray(0 To ByteLen + 3)
If (ByteLen > 0) Then
  Call CopyMem(ByteArray(4), ByteArray(0), ByteLen)
End If
ByteArray(0) = 72 '"H"
ByteArray(1) = 69 '"E"
ByteArray(2) = 48 '"0"
ByteArray(3) = 13 'vbCr
Exit Sub
End If

'Create the temporary result array and make
'space for identifier, checksum, textlen and
'the ASCII values inside the Huffman Tree
ReDim Result(0 To 522)

'Prefix the destination string with the
'"HE3" & vbCr identification string
Result(0) = 72
Result(1) = 69
Result(2) = 51
Result(3) = 13
ResultLen = 4

'Count the frequency of each ASCII code
For i = 0 To (ByteLen - 1)
CharCount(ByteArray(i)) = CharCount(ByteArray(i)) + 1
If (i Mod 1000 = 0) Then
  NewProgress = i / ByteLen * PROGRESS_CALCFREQUENCY
  If (NewProgress <> CurrProgress) Then
  CurrProgress = NewProgress
  Application.SysCmd acSysCmdUpdateMeter, CurrProgress
  End If
End If
Next

'Create a leaf for each character
For i = 0 To 255
If (CharCount(i) > 0) Then
  With Nodes(NodesCount)
  .Weight = CharCount(i)
  .Value = i
  .LeftNode = -1
  .RightNode = -1
  .ParentNode = -1
  End With
  NodesCount = NodesCount + 1
End If
Next

'Create the Huffman Tree
For lNodes = NodesCount To 2 Step -1
'Get the two leafs with the smallest weights
lNode1 = -1: lNode2 = -1
For i = 0 To (NodesCount - 1)
  If (Nodes(i).ParentNode = -1) Then
  If (lNode1 = -1) Then
   lWeight1 = Nodes(i).Weight
   lNode1 = i
  ElseIf (lNode2 = -1) Then
   lWeight2 = Nodes(i).Weight
   lNode2 = i
  ElseIf (Nodes(i).Weight < lWeight1) Then
   If (Nodes(i).Weight < lWeight2) Then
   If (lWeight1 < lWeight2) Then
    lWeight2 = Nodes(i).Weight
    lNode2 = i
   Else
    lWeight1 = Nodes(i).Weight
    lNode1 = i
   End If
   Else
   lWeight1 = Nodes(i).Weight
   lNode1 = i
   End If
  ElseIf (Nodes(i).Weight < lWeight2) Then
   lWeight2 = Nodes(i).Weight
   lNode2 = i
  End If
  End If
Next

'Create a new leaf
With Nodes(NodesCount)
  .Weight = lWeight1 + lWeight2
  .LeftNode = lNode1
  .RightNode = lNode2
  .ParentNode = -1
  .Value = -1
End With

'Set the parentnodes of the two leafs
Nodes(lNode1).ParentNode = NodesCount
Nodes(lNode2).ParentNode = NodesCount

'Increase the node counter
NodesCount = NodesCount + 1
Next
'Traverse the tree to get the bit sequence
'for each character, make temporary room in
'the data array to hold max theoretical size
ReDim Bytes.Data(0 To 255)
Call CreateBitSequences(Nodes(), NodesCount - 1, Bytes, CharValue)

'Calculate the length of the destination
'string after encoding
For i = 0 To 255
If (CharCount(i) > 0) Then
  lLength = lLength + CharValue(i).Count * CharCount(i)
End If
Next
lLength = IIf(lLength Mod 8 = 0, lLength 8, lLength 8 + 1)

'If the destination is larger than the source
'string we leave it uncompressed and prefix
'it with a 4 byte header ("HE0" & vbCr)
If ((lLength = 0) Or (lLength > ByteLen)) Then
ReDim Preserve ByteArray(0 To ByteLen + 3)
Call CopyMem(ByteArray(4), ByteArray(0), ByteLen)
ByteArray(0) = 72
ByteArray(1) = 69
ByteArray(2) = 48
ByteArray(3) = 13
Exit Sub
End If

'Add a simple checksum value to the result
'header for corruption identification
Char = 0
For i = 0 To (ByteLen - 1)
Char = Char Xor ByteArray(i)
If (i Mod 10000 = 0) Then
  NewProgress = i / ByteLen * PROGRESS_CALCCRC + PROGRESS_CALCFREQUENCY
  If (NewProgress <> CurrProgress) Then
  CurrProgress = NewProgress
  Application.SysCmd acSysCmdUpdateMeter, CurrProgress
  End If
End If
Next
Result(ResultLen) = Char
ResultLen = ResultLen + 1

'Add the length of the source string to the
'header for corruption identification
Call CopyMem(Result(ResultLen), ByteLen, 4)
ResultLen = ResultLen + 4

'Create a small array to hold the bit values,
'this is faster than calculating on-fly
For i = 0 To 7
BitValue(i) = 2 ^ i
Next

'Store the number of characters used
Count = 0
For i = 0 To 255
If (CharValue(i).Count > 0) Then
  Count = Count + 1
End If
Next
Call CopyMem(Result(ResultLen), Count, 2)
ResultLen = ResultLen + 2

'Store the used characters and the length
'of their respective bit sequences
Count = 0
For i = 0 To 255
If (CharValue(i).Count > 0) Then
  Result(ResultLen) = i
  ResultLen = ResultLen + 1
  Result(ResultLen) = CharValue(i).Count
  ResultLen = ResultLen + 1
  Count = Count + 16 + CharValue(i).Count
End If
Next

'Make room for the Huffman Tree in the
'destination byte array
ReDim Preserve Result(0 To ResultLen + Count 8)

'Store the Huffman Tree into the result
'converting the bit sequences into bytes
BitPos = 0
ByteValue = 0
For i = 0 To 255
With CharValue(i)
  If (.Count > 0) Then
  For j = 0 To (.Count - 1)
   If (.Data(j)) Then ByteValue = ByteValue + BitValue(BitPos)
   BitPos = BitPos + 1
   If (BitPos = 8) Then
   Result(ResultLen) = ByteValue
   ResultLen = ResultLen + 1
   ByteValue = 0
   BitPos = 0
   End If
  Next
  End If
End With
Next
If (BitPos > 0) Then
Result(ResultLen) = ByteValue
ResultLen = ResultLen + 1
End If

'Resize the destination string to be able to
'contain the encoded string
ReDim Preserve Result(0 To ResultLen - 1 + lLength)

'Now we can encode the data by exchanging each
'ASCII byte for its appropriate bit string.
Char = 0
BitPos = 0
For i = 0 To (ByteLen - 1)
With CharValue(ByteArray(i))
  For j = 0 To (.Count - 1)
  If (.Data(j) = 1) Then Char = Char + BitValue(BitPos)
  BitPos = BitPos + 1
  If (BitPos = 8) Then
   Result(ResultLen) = Char
   ResultLen = ResultLen + 1
   BitPos = 0
   Char = 0
  End If
  Next
End With
If (i Mod 10000 = 0) Then
  NewProgress = i / ByteLen * PROGRESS_ENCODING + PROGRESS_CALCCRC + PROGRESS_CALCFREQUENCY
  If (NewProgress <> CurrProgress) Then
  CurrProgress = NewProgress
  Application.SysCmd acSysCmdUpdateMeter, CurrProgress
  End If
End If
Next
'Add the last byte
If (BitPos > 0) Then
Result(ResultLen) = Char
ResultLen = ResultLen + 1
End If

'Return the destination in string format
ReDim ByteArray(0 To ResultLen - 1)
Call CopyMem(ByteArray(0), Result(0), ResultLen)
'Make sure we get a "100%" progress message
If (CurrProgress <> 100) Then
Application.SysCmd acSysCmdUpdateMeter, 100
End If
Application.SysCmd acSysCmdClearStatus
End Sub
Public Function DecodeString(Text As String) As String

Dim ByteArray() As Byte

'Convert the string to a byte array
ByteArray() = StrConv(Text, vbFromUnicode)

'Compress the byte array
Call DecodeByte(ByteArray, Len(Text))

'Convert the compressed byte array to a string
DecodeString = StrConv(ByteArray(), vbUnicode)

End Function
Public Function EncodeString(Text As String) As String

Dim ByteArray() As Byte

'Convert the string to a byte array
ByteArray() = StrConv(Text, vbFromUnicode)

'Compress the byte array
Call EncodeByte(ByteArray, Len(Text))

'Convert the compressed byte array to a string
EncodeString = StrConv(ByteArray(), vbUnicode)

End Function
Public Sub DecodeByte(ByteArray() As Byte, ByteLen As Long)
Application.SysCmd acSysCmdInitMeter, "Decoding...", 100
Dim i As Long
Dim j As Long
Dim Pos As Long
Dim Char As Byte
Dim CurrPos As Long
Dim Count As Integer
Dim CheckSum As Byte
Dim Result() As Byte
Dim BitPos As Integer
Dim NodeIndex As Long
Dim ByteValue As Byte
Dim ResultLen As Long
Dim NodesCount As Long
Dim lResultLen As Long
Dim NewProgress As Integer
Dim CurrProgress As Integer
Dim BitValue(0 To 7) As Byte
Dim Nodes(0 To 511) As HUFFMANTREE
Dim CharValue(0 To 255) As ByteArray

If (ByteArray(0) <> 72) Or (ByteArray(1) <> 69) Or (ByteArray(3) <> 13) Then
'The source did not contain the identification
'string "HE?" & vbCr where ? is undefined at
'the moment (does not matter)
ElseIf (ByteArray(2) = 48) Then
'The text is uncompressed, return the substring
'Decode = Mid$(Text, 5)
Call CopyMem(ByteArray(0), ByteArray(4), ByteLen - 4)
ReDim Preserve ByteArray(0 To ByteLen - 5)
Exit Sub
ElseIf (ByteArray(2) <> 51) Then
'This is not a Huffman encoded string
Err.Raise vbObjectError, "HuffmanDecode()", "The data either was not compressed with HE3 or is corrupt (identification string not found)"
Exit Sub
End If

CurrPos = 5

'Extract the checksum
CheckSum = ByteArray(CurrPos - 1)
CurrPos = CurrPos + 1

'Extract the length of the original string
Call CopyMem(ResultLen, ByteArray(CurrPos - 1), 4)
CurrPos = CurrPos + 4
lResultLen = ResultLen

'If the compressed string is empty we can
'skip the function right here
If (ResultLen = 0) Then Exit Sub

'Create the result array
ReDim Result(0 To ResultLen - 1)

'Get the number of characters used
Call CopyMem(Count, ByteArray(CurrPos - 1), 2)
CurrPos = CurrPos + 2

'Get the used characters and their
'respective bit sequence lengths
For i = 1 To Count
With CharValue(ByteArray(CurrPos - 1))
  CurrPos = CurrPos + 1
  .Count = ByteArray(CurrPos - 1)
  CurrPos = CurrPos + 1
  ReDim .Data(0 To .Count - 1)
End With
Next

'Create a small array to hold the bit values,
'this is (still) faster than calculating on-fly
For i = 0 To 7
BitValue(i) = 2 ^ i
Next

'Extract the Huffman Tree, converting the
'byte sequence to bit sequences
ByteValue = ByteArray(CurrPos - 1)
CurrPos = CurrPos + 1
BitPos = 0
For i = 0 To 255
With CharValue(i)
  If (.Count > 0) Then
  For j = 0 To (.Count - 1)
   If (ByteValue And BitValue(BitPos)) Then .Data(j) = 1
   BitPos = BitPos + 1
   If (BitPos = 8) Then
   ByteValue = ByteArray(CurrPos - 1)
   CurrPos = CurrPos + 1
   BitPos = 0
   End If
  Next
  End If
End With
Next
If (BitPos = 0) Then CurrPos = CurrPos - 1

'Create the Huffman Tree
NodesCount = 1
Nodes(0).LeftNode = -1
Nodes(0).RightNode = -1
Nodes(0).ParentNode = -1
Nodes(0).Value = -1
For i = 0 To 255
Call CreateTree(Nodes(), NodesCount, i, CharValue(i))
Next

'Decode the actual data
ResultLen = 0
For CurrPos = CurrPos To ByteLen
ByteValue = ByteArray(CurrPos - 1)
For BitPos = 0 To 7
  If (ByteValue And BitValue(BitPos)) Then
  NodeIndex = Nodes(NodeIndex).RightNode
  Else
  NodeIndex = Nodes(NodeIndex).LeftNode
  End If
  If (Nodes(NodeIndex).Value > -1) Then
  Result(ResultLen) = Nodes(NodeIndex).Value
  ResultLen = ResultLen + 1
  If (ResultLen = lResultLen) Then GoTo DecodeFinished
  NodeIndex = 0
  End If
Next
If (CurrPos Mod 10000 = 0) Then
  NewProgress = CurrPos / ByteLen * PROGRESS_DECODING
  If (NewProgress <> CurrProgress) Then
  CurrProgress = NewProgress
  Application.SysCmd acSysCmdUpdateMeter, CurrProgress
    Application.SysCmd acSysCmdUpdateMeter, CurrProgress
  End If
End If
Next
DecodeFinished:
'Verify data to check for corruption.
Char = 0
For i = 0 To (ResultLen - 1)
Char = Char Xor Result(i)
If (i Mod 10000 = 0) Then
  NewProgress = i / ResultLen * PROGRESS_CHECKCRC + PROGRESS_DECODING
  If (NewProgress <> CurrProgress) Then
  CurrProgress = NewProgress
  Application.SysCmd acSysCmdUpdateMeter, CurrProgress
  End If
End If
Next
If (Char <> CheckSum) Then
Err.Raise vbObjectError, "clsHuffman.Decode()", "The data might be corrupted (checksum did not match expected value)"
End If
'Return the uncompressed string
ReDim ByteArray(0 To ResultLen - 1)
Call CopyMem(ByteArray(0), Result(0), ResultLen)

'Make sure we get a "100%" progress message
If (CurrProgress <> 100) Then
  Application.SysCmd acSysCmdUpdateMeter, 100
End If
Application.SysCmd acSysCmdClearStatus
End Sub
Private Sub CreateBitSequences(Nodes() As HUFFMANTREE, ByVal NodeIndex As Integer, Bytes As ByteArray, CharValue() As ByteArray)
Dim NewBytes As ByteArray

'If this is a leaf we set the characters bit
'sequence in the CharValue array
If (Nodes(NodeIndex).Value > -1) Then
CharValue(Nodes(NodeIndex).Value) = Bytes
Exit Sub
End If

'Traverse the left child
If (Nodes(NodeIndex).LeftNode > -1) Then
NewBytes = Bytes
NewBytes.Data(NewBytes.Count) = 0
NewBytes.Count = NewBytes.Count + 1
Call CreateBitSequences(Nodes(), Nodes(NodeIndex).LeftNode, NewBytes, CharValue)
End If

'Traverse the right child
If (Nodes(NodeIndex).RightNode > -1) Then
NewBytes = Bytes
NewBytes.Data(NewBytes.Count) = 1
NewBytes.Count = NewBytes.Count + 1
Call CreateBitSequences(Nodes(), Nodes(NodeIndex).RightNode, NewBytes, CharValue)
End If

End Sub
Private Function FileExist(Filename As String) As Boolean
On Error GoTo FileDoesNotExist

Call FileLen(Filename)
FileExist = True
Exit Function

FileDoesNotExist:
FileExist = False

End Function

About this post

Posted: 2002-06-01
By: ArchiveBot
Viewed: 214 times

Categories

Visual Basic 6

Attachments

No attachments for this post


Loading Comments ...

Comments

No comments have been added for this post.

You must be logged in to make a comment.