Dictionary-based Encoding for Large Dictionaries
Compression Efficiency Calculation
Statistical-based Versus Dictionary-based Compressions
A full implementation on real images can be found here
def lz77_encode(data, search_buffer_size=4096):
    """LZ77-encode *data* into (distance, length, next_symbol) tokens.

    The longest match wins; equal-length matches prefer the nearest
    candidate. A match is never allowed to overlap the current position,
    and the final token may carry '' as next_symbol when the input ends
    exactly on a match. Each step is printed for illustration.
    """
    encoded = []
    pos = 0
    total = len(data)
    while pos < total:
        best_len = 0
        best_dist = 0
        next_symbol = ''
        window_start = max(0, pos - search_buffer_size)
        # Scan candidate start positions from nearest to farthest.
        for cand in range(pos - 1, window_start - 1, -1):
            length = 0
            while pos + length < total and data[cand + length] == data[pos + length]:
                length += 1
                # Stop before the match overlaps pos, or at end of input.
                if cand + length >= pos or pos + length == total:
                    break
            if length > best_len or (length == best_len and pos - cand < best_dist):
                best_len = length
                best_dist = pos - cand
        if pos + best_len < total:
            next_symbol = data[pos + best_len]
        encoded.append((best_dist, best_len, next_symbol))
        # Advance past the matched run plus the literal symbol.
        pos += best_len + 1
        search_buffer = data[max(0, pos - search_buffer_size):pos]
        data_to_be_encoded = data[pos:]
        # Trace the step: emitted token, window contents, remaining input.
        print(f"Token: {encoded[-1]}")
        print(f"Search Buffer ('{search_buffer}') | Data to be Encoded: '{data_to_be_encoded}'")
        print("----")
    return encoded
# Re-encode with updated logic
# Sample sentence used for all the dictionary-coding examples in this article.
example_data = "sir sid eastman easily teases sea sick seal"
# Tokens prefer the longest match, breaking ties by the nearest distance.
encoded_data_longest_nearest = lz77_encode(example_data)
Output:
def lz77_decode(encoded_data):
    """Decode a list of LZ77 (offset, length, next_symbol) tokens.

    Each token copies `length` characters starting `offset` positions back
    in the already-decoded text, then appends `next_symbol` (which may be
    '' for a final token that ends exactly on a match). Each step is
    printed for illustration.

    Raises:
        ValueError: if a token's offset reaches before the decoded text.
    """
    decoded = ""
    for offset, length, next_symbol in encoded_data:
        # Position in the decoded string to start copying from.
        start_pos = len(decoded) - offset
        if start_pos < 0:
            raise ValueError("Invalid offset in token, cannot decode.")
        # Copy one character at a time so overlapping matches correctly
        # re-read text produced earlier within this same copy.
        for i in range(length):
            if start_pos + i < len(decoded):
                decoded += decoded[start_pos + i]
            else:
                break  # malformed token: length runs past the decoded text
        # Append the literal symbol that followed the match.
        decoded += next_symbol
        # Trace the step. (Fixed: the original f-string printed an
        # unbalanced ')' because the opening '(' was missing.)
        print(f"Token: ({offset}, {length}, '{next_symbol}')")
        print(f"Buffer: '{decoded}'")
        print("----")
    return decoded
# Round-trip check: reconstruct the sample sentence from the LZ77 tokens.
decoded_data = lz77_decode(encoded_data_longest_nearest)
print("Decoded data:", decoded_data)
Output:
Decoded data: sir sid eastman easily teases sea sick seal
def lz78_encoding(input_string):
    """LZ78-encode *input_string* into (dictionary, tokens).

    Returns:
        dictionary: index -> phrase map; entry 0 is the 'NULL' placeholder.
        tokens: list of (index, char) pairs. The final pair carries '' as
            the char when the input ends exactly on a known phrase.
    """
    # Index-to-phrase map, kept in this shape for display/compatibility.
    dictionary = {0: 'NULL'}
    # Reverse map for O(1) phrase lookups instead of scanning
    # dictionary.values() on every character. It deliberately excludes the
    # 'NULL' placeholder, which also fixes a latent bug in the original:
    # an input that built up the literal phrase 'NULL' would falsely match
    # the sentinel entry.
    phrase_index = {}
    dict_index = 1  # 0 is reserved for the NULL entry
    tokens = []
    current_string = ''
    for char in input_string:
        candidate = current_string + char
        if candidate in phrase_index:
            # Known phrase: keep extending it.
            current_string = candidate
        else:
            # Emit (index of longest known prefix, first novel character).
            index = phrase_index[current_string] if current_string else 0
            tokens.append((index, char))
            # Register the novel phrase under the next free index.
            dictionary[dict_index] = candidate
            phrase_index[candidate] = dict_index
            dict_index += 1
            current_string = ''
    # Input ended mid-phrase: emit the phrase's index with an empty char.
    if current_string:
        tokens.append((phrase_index[current_string], ''))
    return dictionary, tokens
# Encode the sample sentence with LZ78 and display the results.
input_string = "sir sid eastman easily teases sea sick seal"
dict_result, tokens_result = lz78_encoding(input_string)

# Emitted (index, char) tokens, one per line.
print("Tokens:")
for token in tokens_result:
    print(f"{token},")

# The dictionary built up during encoding.
print("\nDictionary Entries:")
for key, value in dict_result.items():
    print(f"{key}: '{value}'")
Output:
Tokens: (0, 's'), (0, 'i'), (0, 'r'), (0, ' '), (1, 'i'), (0, 'd'), (4, 'e'), (0, 'a'), (1, 't'), (0, 'm'), (8, 'n'), (7, 'a'), (5, 'l'), (0, 'y'), (4, 't'), (0, 'e'), (8, 's'), (16, 's'), (4, 's'), (16, 'a'), (19, 'i'), (0, 'c'), (0, 'k'), (19, 'e'), (8, 'l')
Dictionary Entries: 0: 'NULL' 1: 's' 2: 'i' 3: 'r' 4: ' ' 5: 'si' 6: 'd' 7: ' e' 8: 'a' 9: 'st' 10: 'm' 11: 'an' 12: ' ea' 13: 'sil' 14: 'y' 15: ' t' 16: 'e' 17: 'as' 18: 'es' 19: ' s' 20: 'ea' 21: ' si' 22: 'c' 23: 'k' 24: ' se' 25: 'al'
or
def lz78_decoding(tokens):
    """Rebuild the original text from LZ78 (index, char) tokens.

    Mirrors the encoder: each token expands to dictionary[index] + char,
    and that expansion becomes the next dictionary entry.
    """
    dictionary = {0: ''}  # index 0 is the empty (NULL) phrase
    pieces = []
    for index, char in tokens:
        # An index missing from the dictionary falls back to the bare char.
        entry = dictionary.get(index, '') + char
        dictionary[len(dictionary)] = entry
        pieces.append(entry)
    return ''.join(pieces)
# The tokens from the LZ78 encoding
# (the stream produced by lz78_encoding for the sample sentence above).
tokens = [
(0, 's'),
(0, 'i'),
(0, 'r'),
(0, ' '),
(1, 'i'),
(0, 'd'),
(4, 'e'),
(0, 'a'),
(1, 't'),
(0, 'm'),
(8, 'n'),
(7, 'a'),
(5, 'l'),
(0, 'y'),
(4, 't'),
(0, 'e'),
(8, 's'),
(16, 's'),
(4, 's'),
(16, 'a'),
(19, 'i'),
(0, 'c'),
(0, 'k'),
(19, 'e'),
(8, 'l')
]
# Perform the LZ78 decoding
decoded_string = lz78_decoding(tokens)
print(decoded_string)
Output: sir sid eastman easily teases sea sick seal
{0, 1, 2, ..., 255}
def lzw_encoding(input_string):
    """LZW-encode *input_string* into (dictionary, tokens).

    The dictionary is seeded with the 256 single-byte strings. Token values
    at or above 256 are shifted up by one so the printed dictionary can
    start at 257, matching this article's presentation.

    Returns:
        dictionary: code -> string map (0-255 seed plus added phrases).
        tokens: list of int codes, with codes >= 256 emitted as code + 1.
    """
    dictionary = {i: chr(i) for i in range(256)}
    # Reverse map for O(1) lookups instead of scanning dictionary.values()
    # on every character.
    string_to_code = {chr(i): i for i in range(256)}
    dict_size = 256
    current_string = ''
    tokens = []
    for char in input_string:
        new_string = current_string + char
        if new_string in string_to_code:
            # Known phrase: keep extending it.
            current_string = new_string
        else:
            if current_string:
                code = string_to_code[current_string]
                # Shift codes past the seed range (article numbering quirk).
                tokens.append(code if code < 256 else code + 1)
            # Register the novel phrase.
            dictionary[dict_size] = new_string
            string_to_code[new_string] = dict_size
            dict_size += 1
            # NOTE(review): assumes 8-bit text; a char with ord >= 256 would
            # start an unseeded phrase (same behavior as the original).
            current_string = char
    # Flush the final pending phrase, if any.
    if current_string:
        code = string_to_code[current_string]
        tokens.append(code if code < 256 else code + 1)
    return dictionary, tokens
# Encode the sample sentence with LZW.
input_string = "sir sid eastman easily teases sea sick seal"
dict_result, tokens_result = lzw_encoding(input_string)

# Token stream (codes >= 256 were already shifted by the encoder).
for token in tokens_result:
    print(token)

# Only the phrases the encoder added, renumbered to start at 257.
print("\nNew dictionary entries (starting from 257):")
new_dict_entries = {k + 1: v for k, v in dict_result.items() if k >= 256}
for key, value in new_dict_entries.items():
    print(f"{key}: '{value}'")
Output:
115 105 114 32 257 100 32 101 97 115 116 109 97 110 263 265 105 108 121 32 116 264 115 101 115 260 264 260 105 99 107 282 97 108
New dictionary entries (starting from 257): 257: 'si' 258: 'ir' 259: 'r ' 260: ' s' 261: 'sid' 262: 'd ' 263: ' e' 264: 'ea' 265: 'as' 266: 'st' 267: 'tm' 268: 'ma' 269: 'an' 270: 'n ' 271: ' ea' 272: 'asi' 273: 'il' 274: 'ly' 275: 'y ' 276: ' t' 277: 'te' 278: 'eas' 279: 'se' 280: 'es' 281: 's ' 282: ' se' 283: 'ea ' 284: ' si' 285: 'ic' 286: 'ck' 287: 'k ' 288: ' sea' 289: 'al'
or
Decode Function: TODO