实验3-计算机考试-空麓网

一、实验目的

掌握词典编码的基本原理，用C/C++/Python等语言编程实现LZW解码器并分析编解码算法。

二、实验内容

1．LZW编码原理和实现算法

LZW的编码思想是不断地从字符流中提取新的字符串，通俗地理解为新“词条”，然后用“代号”也就是码字表示这个“词条”。这样一来，对字符流的编码就变成了用码字去替换字符流，生成码字流，从而达到压缩数据的目的。LZW编码是围绕称为词典的转换表来完成的。LZW编码器通过管理这个词典完成输入与输出之间的转换。LZW编码器的输入是字符流，字符流可以是用8位ASCII字符组成的字符串，而输出是用n位(例如12位)表示的码字流。

LZW编码算法的步骤如下：

步骤1：将词典初始化为包含所有可能的单字符，当前前缀P初始化为空。

步骤2：当前字符C=字符流中的下一个字符。

步骤3：判断P＋C是否在词典中

（1）如果“是”，则用C扩展P，即让P=P＋C，返回到步骤2。

（2）如果“否”，则输出与当前前缀P相对应的码字W；将P＋C添加到词典中；令P=C，并返回到步骤2

LZW编码算法可用下述函数实现。首先初始化词典，然后顺序从待压缩文件中读入字符并按照上述算法执行编码。最后将编得的码字流输出至文件中。

未选择任何文件
```c
/*
 * LZWEncode - compress the bytes of fp into a stream of LZW codes on bf.
 *
 * fp: input stream; it is measured by seeking to its end, rewound, and then
 *     read one character at a time until EOF.
 * bf: output bit stream; receives the input length as a 32-bit value,
 *     followed by one code per emitted phrase (written with output()).
 */
void LZWEncode( FILE *fp, BITFILE *bf){
	unsigned long file_length;	/* size of the input in bytes */
	int prefix_code = -1;	/* code of the current prefix P; -1 means P is empty */
	int symbol;	/* current input character C */

	/* measure the input, then go back to its first byte */
	fseek( fp, 0, SEEK_END);
	file_length = ftell( fp);
	fseek( fp, 0, SEEK_SET);

	/* the length header precedes all codes */
	BitsOutput( bf, file_length, 4*8);
	InitDictionary();

	for( symbol = fgetc( fp); EOF != symbol; symbol = fgetc( fp)){
		int extended = InDictionary( symbol, prefix_code);	/* code of P+C, negative if unknown */

		if( extended >= 0){
			/* P+C is already a phrase: keep growing the prefix */
			prefix_code = extended;
			continue;
		}
		/* P+C is new: emit P, remember P+C while there is room, restart from C */
		output( bf, prefix_code);
		if( next_code < MAX_CODE){
			AddToDictionary( symbol, prefix_code);
		}
		prefix_code = symbol;
	}
	/* the input ended with a prefix that has not been emitted yet */
	output( bf, prefix_code);
}

```

2．LZW解码原理和实现算法

步骤1：在开始译码时词典包含所有可能的前缀根。

步骤2：令CW：=码字流中的第一个码字。

步骤3：输出当前缀-符串string.CW到字符流。

步骤4：先前码字PW：=当前码字CW。

步骤5：当前码字CW：=码字流的下一个码字。

步骤6：判断当前缀-符串string.CW 是否在词典中。

（1）如果“是”，则把当前缀-符串string.CW输出到字符流。当前前缀P：=先前缀-符串string.PW。当前字符C：=当前缀-符串string.CW的第一个字符。把缀-符串P+C添加到词典。

（2）如果“否”，则当前前缀P：=先前缀-符串string.PW。当前字符C：=先前缀-符串string.PW的第一个字符（此时CW尚未加入词典，无法由CW取得首字符）。输出缀-符串P+C到字符流，然后把它添加到词典中。

步骤7：判断码字流中是否还有码字要译。

（1）如果”是”，就返回步骤4。

（2）如果”否”，结束。

LZW解码算法可用下述函数实现。首先初始化词典，然后顺序从压缩文件中读入码字并按照上述算法执行解码。最后将解得的字符串输出至文件中。

/*
 * LZWDecode - rebuild the original byte stream from the LZW codes in bf.
 *
 * bf: input bit stream; a 32-bit byte count is read first, then one code
 *     per phrase (fetched with input()).
 * fp: stream that receives the decoded bytes.
 */
void LZWDecode( BITFILE *bf, FILE *fp){
	/* First character of the most recently decoded phrase. It is initialised so
	 * that a stream whose very first code is not in the dictionary cannot make
	 * the branch below read an indeterminate value. */
	int character = 0;
	int new_code, last_code;	/* new_code is the current code (CW), last_code the previous one (PW) */
	int phrase_length;	/* number of characters of the current phrase held in d_stack */
	unsigned long file_length;	/* decoded bytes that still have to be written */

	file_length = BitsInput(bf, 4 * 8);
	if (-1 == file_length) file_length = 0;
	InitDictionary();
	last_code = -1;	/* there is no previous code before the first one */
	while (0 < file_length)
	{
		new_code = input(bf);
		if (new_code >= next_code)
		{
			/* CW is not in the dictionary yet: the phrase is PW followed by the
			 * first character of PW. d_stack is filled last-character-first, so
			 * that extra character goes into slot 0 and PW is stacked above it. */
			d_stack[0] = character;
			phrase_length = DecodeString(1, last_code);
		}
		else
		{
			/* CW is a known phrase: expand it directly */
			phrase_length = DecodeString(0, new_code);
		}
		/* the top of the stack is the first character of the phrase */
		character = d_stack[phrase_length - 1];
		/* pop the stack so the phrase is written in its natural order */
		while (0 < phrase_length)
		{
			phrase_length--;
			fputc(d_stack[phrase_length], fp);
			file_length--;
		}
		if (MAX_CODE > next_code)
		{
			/* add PW + first character of the current phrase (ignored while last_code is -1) */
			AddToDictionary(character, last_code);
		}
		last_code = new_code;	/* the current code becomes the previous one */
	}
}

三、实验步骤

1.调试LZW编码程序

以一个文本文件作为输入，得到输出的LZW编码文件

2 .编写LZW的解码程序

将code对应字符串放置于d_stack栈数组中。
返回字符总数，即要输出字符串的长度

/*
 * DecodeString - expand a dictionary code into d_stack.
 *
 * start: first d_stack slot to fill.
 * code:  dictionary code to expand; a negative value expands to nothing.
 *
 * The parent links are followed from the code up to its root, so the
 * characters are stored last-character-first.
 * Returns the index one past the last slot written.
 */
int DecodeString( int start, int code){
    int count = start;

    for( ; code >= 0; code = dictionary[code].parent){
        d_stack[count++] = dictionary[code].suffix;
    }
    return count;
}

3.使用LZW编码器压缩不同格式类型文件

选取10种不同格式类型文件，使用LZW编码器进行压缩。

部分文件压缩后的大小反而比压缩前大。原因在于本编码器的每个码字固定占16位，而原始字符只占8位；当文件较小或其中重复出现的短语还不够多时，词典中的长词条很少被用到，码字流就会比原始数据更长。文本内容的重复率越高（数据冗余越大），压缩效率就越高。对于本身已经过压缩的文件类型（例如png、jpg、gif文件），其压缩编码已经将文件中的大部分相关性去除，因此LZW的压缩效果会受到严重影响。

问题：当前码字在词典中不存在时应如何处理并解释原因。
原因：解码端词典的建立比编码端滞后一步。当编码器刚把一个新词条加入词典、紧接着就输出该词条的码字时，解码器尚未建立这个词条，因此当前码字在解码端的词典中不存在。
处理方法：这种情况下新词条必然是“先前码字PW对应的字符串＋该字符串的第一个字符”，因此输出上一个解出的字符串再加上它自己的第一个字符，并把这个字符串加入词典。

四、代码

lzw.c: 核心文件，定义编解码相关函数

/* Defined before any header is included; on MSVC this silences the
 * deprecation warnings for fopen and similar CRT functions. */
#define _CRT_SECURE_NO_WARNINGS 
#include <stdio.h>
#include <stdlib.h>
#include "bitio.h"
/* Dictionary capacity limit: the dictionary array has MAX_CODE + 1 slots and
 * new phrases are added only while next_code is below MAX_CODE. */
#define MAX_CODE 65535

/*
 * The LZW dictionary, kept as a tree in a flat array indexed by code.
 * A node stores the last character of its phrase and links to its parent,
 * its first child and its next sibling; -1 marks a missing link.
 */
struct {
	int suffix;	/* last character of the phrase */
	int parent;	/* code of the phrase without its last character */
	int firstchild;	/* first phrase that extends this one by one character */
	int nextsibling;	/* next phrase sharing the same parent */
} dictionary[MAX_CODE + 1];
int next_code;	/* code that the next new phrase will receive */
int d_stack[MAX_CODE];	/* stack used while decoding a phrase */

/* Read / write one code as a 16-bit field of the bit stream. */
#define input(f) ((int)BitsInput( f, 16))
#define output(f, x) BitsOutput( f, (unsigned long)(x), 16)

int DecodeString(int start, int code);
void InitDictionary(void);

/*
 * PrintDictionary - debugging aid that prints every phrase added after the
 * 256 single-character roots, one "code->phrase" line per entry.
 */
void PrintDictionary(void) {
	int n;
	int count;
	for (n = 256; n < next_code; n++) {
		count = DecodeString(0, n);
		printf("%4d->", n);
		/* d_stack holds the phrase last-character-first, so print from the top down */
		while (0 < count--) printf("%c", (char)(d_stack[count]));
		/* terminate the line (the escape sequence had been reduced to a literal 'n') */
		printf("\n");
	}
}

/*
 * DecodeString - expand a dictionary code into d_stack.
 *
 * start: first d_stack slot to fill.
 * code:  dictionary code to expand; a negative value expands to nothing.
 *
 * The parent links are followed from the code up to its root, so the
 * characters are stored last-character-first.
 * Returns the index one past the last slot written.
 */
int DecodeString(int start, int code) {
	int top = start;

	for (; code >= 0; code = dictionary[code].parent) {
		d_stack[top] = dictionary[code].suffix;
		top++;
	}
	return top;
}
/*
 * InitDictionary - reset the dictionary to the 256 single-character roots.
 *
 * Root i stands for the character i, has no parent and no child, and is
 * chained to root i + 1; the chain ends at root 255. New phrases are
 * numbered from 256 onwards.
 */
void InitDictionary(void) {
	int code;

	for (code = 0; code < 256; code++) {
		dictionary[code].suffix = code;
		dictionary[code].parent = -1;
		dictionary[code].firstchild = -1;
		/* the last root terminates the sibling chain */
		dictionary[code].nextsibling = (code < 255) ? code + 1 : -1;
	}
	next_code = 256;
}

/*
 * InDictionary - look up the phrase "string_code followed by character".
 *
 * character:   character appended to the prefix.
 * string_code: code of the prefix; a negative value means an empty prefix.
 *
 * Returns the code of the extended phrase, or -1 when it is not in the
 * dictionary. With an empty prefix the character itself is returned.
 */
int InDictionary(int character, int string_code) {
	int node;

	if (string_code < 0) return character;

	/* scan the children of the prefix for one that ends in `character` */
	for (node = dictionary[string_code].firstchild; node >= 0; node = dictionary[node].nextsibling) {
		if (dictionary[node].suffix == character) return node;
	}
	return -1;
}

/*
 * AddToDictionary - store the phrase "string_code followed by character"
 * under the code next_code and advance next_code.
 *
 * character:   last character of the new phrase.
 * string_code: code of the prefix; nothing is added when it is negative.
 *
 * The function does not check next_code against the dictionary size.
 */
void AddToDictionary(int character, int string_code) {
	int *link;

	if (string_code < 0) return;

	/* fill in the new node; it has no children or siblings yet */
	dictionary[next_code].suffix = character;
	dictionary[next_code].parent = string_code;
	dictionary[next_code].firstchild = -1;
	dictionary[next_code].nextsibling = -1;

	/* Find the empty link at the end of the parent's child list: either the
	 * parent's firstchild field or the nextsibling field of its last child. */
	link = &dictionary[string_code].firstchild;
	while (*link >= 0) {
		link = &dictionary[*link].nextsibling;
	}
	*link = next_code;

	next_code++;
}

/*
 * LZWEncode - compress the bytes of fp into a stream of LZW codes on bf.
 *
 * fp: input stream; it is measured by seeking to its end, rewound, and then
 *     read one character at a time until EOF.
 * bf: output bit stream; receives the input length as a 32-bit value,
 *     followed by one 16-bit code per emitted phrase.
 */
void LZWEncode(FILE* fp, BITFILE* bf) {
	unsigned long file_length;	/* size of the input in bytes */
	int prefix = -1;	/* code of the current prefix; -1 means empty */
	int ch;	/* current input character */
	int match;	/* code of prefix + ch, negative if unknown */

	/* determine the input size, then rewind */
	fseek(fp, 0, SEEK_END);
	file_length = ftell(fp);
	fseek(fp, 0, SEEK_SET);

	/* the length header precedes all codes */
	BitsOutput(bf, file_length, 4 * 8);
	InitDictionary();

	while ((ch = fgetc(fp)) != EOF) {
		match = InDictionary(ch, prefix);
		if (match < 0) {
			/* prefix + ch is new: emit the prefix, learn the phrase if
			 * there is room, and start over from ch */
			output(bf, prefix);
			if (next_code < MAX_CODE) {
				AddToDictionary(ch, prefix);
			}
			prefix = ch;
		}
		else {
			/* prefix + ch is a known phrase: it becomes the new prefix */
			prefix = match;
		}
	}
	/* emit the prefix that was still pending when the input ended */
	output(bf, prefix);
}

/*
 * LZWDecode - rebuild the original byte stream from the LZW codes in bf.
 *
 * bf: input bit stream; a 32-bit byte count is read first, then one 16-bit
 *     code per phrase.
 * fp: stream that receives the decoded bytes.
 */
void LZWDecode(BITFILE* bf, FILE* fp) {
	unsigned long file_length;	/* decoded bytes that still have to be written */
	int first_char = 0;	/* first character of the most recently decoded phrase */
	int prev_code = -1;	/* previous code; -1 before the first code is read */

	file_length = BitsInput(bf, 4 * 8);
	if (-1 == file_length) file_length = 0;
	/* start from the 256 single-character entries */
	InitDictionary();

	while (file_length > 0) {
		int code = input(bf);
		int length;	/* number of characters of the phrase held in d_stack */
		int pos;

		if (code < next_code) {
			/* known code: expand it directly */
			length = DecodeString(0, code);
		}
		else {
			/* Unknown code: the phrase is the previous phrase followed by
			 * its own first character. d_stack is filled last-character-first,
			 * so that extra character goes into slot 0. */
			d_stack[0] = first_char;
			length = DecodeString(1, prev_code);
		}

		/* the top of the stack is the first character of the phrase */
		first_char = d_stack[length - 1];

		/* pop the stack so the phrase is written in its natural order */
		for (pos = length - 1; pos >= 0; pos--) {
			fputc(d_stack[pos], fp);
			file_length--;
		}

		/* learn "previous phrase + first character of this phrase" */
		if (next_code < MAX_CODE) {
			AddToDictionary(first_char, prev_code);
		}
		prev_code = code;
	}
}



/*
 * Command-line entry point.
 *
 * argv[1]: operation, selected by its first character: 'E' encodes, 'D' decodes.
 * argv[2]: input file name.
 * argv[3]: output file name.
 *
 * Returns 0 on success and -1 on a usage error, an unsupported operation or
 * a file that cannot be opened.
 */
int main(int argc, char** argv) {
	FILE* fp;
	BITFILE* bf;

	if (4 > argc) {
		fprintf(stdout, "usage: \n%s <o> <ifile> <ofile>\n", argv[0]);
		fprintf(stdout, "\t<o>: E or D reffers encode or decode\n");
		fprintf(stdout, "\t<ifile>: input file name\n");
		fprintf(stdout, "\t<ofile>: output file name\n");
		return -1;
	}
	if ('E' == argv[1][0]) { // do encoding
		/* open the input first so that a missing input does not create an output file */
		fp = fopen(argv[2], "rb");
		if (NULL == fp) {
			fprintf(stderr, "cannot open input file %s\n", argv[2]);
			return -1;
		}
		bf = OpenBitFileOutput(argv[3]);
		if (NULL == bf) {
			fprintf(stderr, "cannot open output file %s\n", argv[3]);
			fclose(fp);	/* do not leak the input stream */
			return -1;
		}
		LZWEncode(fp, bf);
		fclose(fp);
		CloseBitFileOutput(bf);
		fprintf(stdout, "encoding done\n");
	}
	else if ('D' == argv[1][0]) {	// do decoding
		bf = OpenBitFileInput(argv[2]);
		if (NULL == bf) {
			fprintf(stderr, "cannot open input file %s\n", argv[2]);
			return -1;
		}
		fp = fopen(argv[3], "wb");
		if (NULL == fp) {
			fprintf(stderr, "cannot open output file %s\n", argv[3]);
			CloseBitFileInput(bf);	/* do not leak the bit stream */
			return -1;
		}
		LZWDecode(bf, fp);
		fclose(fp);
		CloseBitFileInput(bf);
		fprintf(stdout, "decoding done\n");
	}
	else {	// otherwise
		fprintf(stderr, "not supported operation\n");
		return -1;
	}
	return 0;
}

bitio.c: 定义用于读写码字的函数

/* Defined before any header is included; on MSVC this silences the
 * deprecation warnings for fopen and similar CRT functions. */
#define _CRT_SECURE_NO_WARNINGS 
#include <stdio.h>	/* FILE, fopen, fclose, fgetc, fputc, fprintf */
#include <stdlib.h>	/* malloc, free, exit */
#include "bitio.h"
/*
 * OpenBitFileInput - create a bit-level reader.
 *
 * filename: file to open in binary read mode; NULL selects stdin.
 *
 * Returns a newly allocated BITFILE, or NULL when the allocation or the
 * fopen fails. The result is released with CloseBitFileInput().
 */
BITFILE* OpenBitFileInput(char* filename) {
	BITFILE* bf = malloc(sizeof *bf);

	if (NULL == bf) return NULL;
	if (NULL == filename)	bf->fp = stdin;
	else bf->fp = fopen(filename, "rb");
	if (NULL == bf->fp) {
		free(bf);	/* do not leak the descriptor when the file cannot be opened */
		return NULL;
	}
	bf->mask = 0x80;	/* 0x80 makes the first BitInput() call fetch a byte */
	bf->rack = 0;
	return bf;
}

/*
 * OpenBitFileOutput - create a bit-level writer.
 *
 * filename: file to open in binary write mode; NULL selects stdout.
 *
 * Returns a newly allocated BITFILE, or NULL when the allocation or the
 * fopen fails. The result is released with CloseBitFileOutput().
 */
BITFILE* OpenBitFileOutput(char* filename) {
	BITFILE* bf = malloc(sizeof *bf);

	if (NULL == bf) return NULL;
	if (NULL == filename)	bf->fp = stdout;
	else bf->fp = fopen(filename, "wb");
	if (NULL == bf->fp) {
		free(bf);	/* do not leak the descriptor when the file cannot be opened */
		return NULL;
	}
	bf->mask = 0x80;	/* the next bit written is the top bit of an empty byte */
	bf->rack = 0;
	return bf;
}

/*
 * CloseBitFileInput - close the stream behind a bit reader and free the
 * reader itself. bf must not be used afterwards.
 */
void CloseBitFileInput(BITFILE* bf) {
	FILE* stream = bf->fp;

	fclose(stream);
	free(bf);
}

/*
 * CloseBitFileOutput - write out a partially filled byte, if any, then
 * close the stream and free the writer. bf must not be used afterwards.
 */
void CloseBitFileOutput(BITFILE* bf) {
	/* mask is 0x80 only when no bit has been placed in rack since the last full byte */
	if (bf->mask != 0x80) {
		fputc(bf->rack, bf->fp);
	}
	fclose(bf->fp);
	free(bf);
}

/*
 * BitInput - read the next bit of the stream, most significant bit of each
 * byte first.
 *
 * Returns 0 or 1. When a new byte is needed and the file is exhausted, an
 * error is printed to stderr and the program exits with status -1.
 */
int BitInput(BITFILE* bf) {
	int value;

	if (0x80 == bf->mask) {
		/* mask is at the top bit: the previous byte is used up, fetch the next one */
		bf->rack = fgetc(bf->fp);
		if (EOF == bf->rack) {
			fprintf(stderr, "Read after the end of file reached\n");
			exit(-1);
		}
	}
	value = bf->mask & bf->rack;
	bf->mask >>= 1;
	/* after the lowest bit, wrap around so the next call loads a new byte */
	if (0 == bf->mask) bf->mask = 0x80;
	return((0 == value) ? 0 : 1);
}

/*
 * BitsInput - read `count` bits and return them as an unsigned value, the
 * first bit read being the most significant of the result.
 *
 * count: number of bits to read; a value of 0 or less reads nothing and
 *        returns 0.
 *
 * The value is accumulated by shifting an unsigned long, so no signed
 * shift is involved even for count == 32 on platforms with a 32-bit long.
 */
unsigned long BitsInput(BITFILE* bf, int count) {
	unsigned long value = 0UL;
	int i;

	for (i = 0; i < count; i++) {
		/* make room for the next bit, then append it */
		value = (value << 1) | (unsigned long)BitInput(bf);
	}
	return value;
}

/*
 * BitOutput - append one bit to the stream; bits fill each byte from the
 * most significant position downwards.
 *
 * bit: any non-zero value writes a 1, zero writes a 0.
 */
void BitOutput(BITFILE* bf, int bit) {
	if (bit != 0) {
		bf->rack |= bf->mask;
	}
	bf->mask >>= 1;
	if (bf->mask != 0) {
		return;	/* the byte is not complete yet */
	}
	/* eight bits collected: write the byte and start an empty one */
	fputc(bf->rack, bf->fp);
	bf->rack = 0;
	bf->mask = 0x80;
}

/*
 * BitsOutput - write the lowest `count` bits of `code`, most significant of
 * those bits first.
 *
 * code:  value to write.
 * count: number of bits to write; a value of 0 or less writes nothing.
 *        It must not exceed the width of unsigned long.
 */
void BitsOutput(BITFILE* bf, unsigned long code, int count) {
	unsigned long mask;

	if (count <= 0) return;	/* avoids shifting by a negative amount */

	/* 1UL keeps the shift unsigned: 1L << 31 overflows where long has 32 bits */
	for (mask = 1UL << (count - 1); 0 != mask; mask >>= 1) {
		BitOutput(bf, (0 == (code & mask)) ? 0 : 1);
	}
}

bitio.h: 声明函数的信息

#ifndef __BITIO__
#define __BITIO__

#include <stdio.h>	/* FILE */

/* State of a bit-level reader or writer layered on a stdio stream. */
typedef struct {
	FILE* fp;	/* underlying stream */
	unsigned char mask;	/* selects the bit of rack to read or write next; 0x80 is the top bit */
	int rack;	/* one-byte buffer: written to the file after 8 bits, refilled from it when reading */
}BITFILE;

/* Open a bit reader on filename (NULL selects stdin); returns NULL on failure. */
BITFILE* OpenBitFileInput(char* filename);
/* Open a bit writer on filename (NULL selects stdout); returns NULL on failure. */
BITFILE* OpenBitFileOutput(char* filename);
/* Close the stream of a bit reader and free it. */
void CloseBitFileInput(BITFILE* bf);
/* Write out any partially filled byte, close the stream and free the writer. */
void CloseBitFileOutput(BITFILE* bf);
/* Read one bit; returns 0 or 1. */
int BitInput(BITFILE* bf);
/* Read count bits, first bit most significant. */
unsigned long BitsInput(BITFILE* bf, int count);
/* Write one bit (non-zero means 1). */
void BitOutput(BITFILE* bf, int bit);
/* Write the lowest count bits of code, most significant first. */
void BitsOutput(BITFILE* bf, unsigned long code, int count);
#endif	// __BITIO__

实验3

C/C++/C#相关栏目本月热门文章