--- /dev/null
+\r
+; match.asm -- Pentium-Pro optimized version of longest_match()\r
+;\r
+; Updated for zlib 1.1.3 and converted to MASM 6.1x\r
+; Copyright (C) 2000 Dan Higdon <hdan@kinesoft.com>\r
+; and Chuck Walbourn <chuckw@kinesoft.com>\r
+; Corrections by Cosmin Truta <cosmint@cs.ubbcluj.ro>\r
+;\r
+; This is free software; you can redistribute it and/or modify it\r
+; under the terms of the GNU General Public License.\r
+\r
+; Based on match.S\r
+; Written for zlib 1.1.2\r
+; Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com>\r
+;\r
+; Modified by Gilles Vollant (2005) for add gzhead and gzindex\r
+\r
+ .686P\r
+ .MODEL FLAT\r
+\r
+;===========================================================================\r
+; EQUATES\r
+;===========================================================================\r
+\r
+MAX_MATCH EQU 258\r
+MIN_MATCH EQU 3\r
+MIN_LOOKAHEAD EQU (MAX_MATCH + MIN_MATCH + 1)\r
+MAX_MATCH_8 EQU ((MAX_MATCH + 7) AND (NOT 7))\r
+\r
+;===========================================================================\r
+; STRUCTURES\r
+;===========================================================================\r
+\r
+; This STRUCT assumes a 4-byte alignment\r
+\r
+DEFLATE_STATE STRUCT\r
+ds_strm dd ?\r
+ds_status dd ?\r
+ds_pending_buf dd ?\r
+ds_pending_buf_size dd ?\r
+ds_pending_out dd ?\r
+ds_pending dd ?\r
+ds_wrap dd ?\r
+; gzhead and gzindex are added in zlib 1.2.2.2 (see deflate.h)\r
+ds_gzhead dd ?\r
+ds_gzindex dd ?\r
+ds_data_type db ?\r
+ds_method db ?\r
+ db ? ; padding\r
+ db ? ; padding\r
+ds_last_flush dd ?\r
+ds_w_size dd ? ; used\r
+ds_w_bits dd ?\r
+ds_w_mask dd ? ; used\r
+ds_window dd ? ; used\r
+ds_window_size dd ?\r
+ds_prev dd ? ; used\r
+ds_head dd ?\r
+ds_ins_h dd ?\r
+ds_hash_size dd ?\r
+ds_hash_bits dd ?\r
+ds_hash_mask dd ?\r
+ds_hash_shift dd ?\r
+ds_block_start dd ?\r
+ds_match_length dd ? ; used\r
+ds_prev_match dd ? ; used\r
+ds_match_available dd ?\r
+ds_strstart dd ? ; used\r
+ds_match_start dd ? ; used\r
+ds_lookahead dd ? ; used\r
+ds_prev_length dd ? ; used\r
+ds_max_chain_length dd ? ; used\r
+ds_max_laxy_match dd ?\r
+ds_level dd ?\r
+ds_strategy dd ?\r
+ds_good_match dd ? ; used\r
+ds_nice_match dd ? ; used\r
+\r
+; Don't need anymore of the struct for match\r
+DEFLATE_STATE ENDS\r
+\r
+;===========================================================================\r
+; CODE\r
+;===========================================================================\r
+_TEXT SEGMENT\r
+\r
+;---------------------------------------------------------------------------\r
+; match_init\r
+;---------------------------------------------------------------------------\r
+ ALIGN 4\r
+PUBLIC _match_init\r
+_match_init PROC\r
+ ; no initialization needed\r
+ ret\r
+_match_init ENDP\r
+\r
+;---------------------------------------------------------------------------\r
+; uInt longest_match(deflate_state *deflatestate, IPos curmatch)\r
+;---------------------------------------------------------------------------\r
+ ALIGN 4\r
+\r
+PUBLIC _longest_match\r
+_longest_match PROC\r
+\r
+; Since this code uses EBP for a scratch register, the stack frame must\r
+; be manually constructed and referenced relative to the ESP register.\r
+\r
+; Stack image\r
+; Variables\r
+chainlenwmask = 0 ; high word: current chain len\r
+ ; low word: s->wmask\r
+window = 4 ; local copy of s->window\r
+windowbestlen = 8 ; s->window + bestlen\r
+scanend = 12 ; last two bytes of string\r
+scanstart = 16 ; first two bytes of string\r
+scanalign = 20 ; dword-misalignment of string\r
+nicematch = 24 ; a good enough match size\r
+bestlen = 28 ; size of best match so far\r
+scan = 32 ; ptr to string wanting match\r
+varsize = 36 ; number of bytes (also offset to last saved register)\r
+\r
+; Saved Registers (actually pushed into place)\r
+ebx_save = 36\r
+edi_save = 40\r
+esi_save = 44\r
+ebp_save = 48\r
+\r
+; Parameters\r
+retaddr = 52\r
+deflatestate = 56\r
+curmatch = 60\r
+\r
+; Save registers that the compiler may be using\r
+ push ebp\r
+ push edi\r
+ push esi\r
+ push ebx\r
+\r
+; Allocate local variable space\r
+ sub esp,varsize\r
+\r
+; Retrieve the function arguments. ecx will hold cur_match\r
+; throughout the entire function. edx will hold the pointer to the\r
+; deflate_state structure during the function's setup (before\r
+; entering the main loop).\r
+\r
+ mov edx, [esp+deflatestate]\r
+ASSUME edx:PTR DEFLATE_STATE\r
+\r
+ mov ecx, [esp+curmatch]\r
+\r
+; uInt wmask = s->w_mask;\r
+; unsigned chain_length = s->max_chain_length;\r
+; if (s->prev_length >= s->good_match) {\r
+; chain_length >>= 2;\r
+; }\r
+\r
+ mov eax, [edx].ds_prev_length\r
+ mov ebx, [edx].ds_good_match\r
+ cmp eax, ebx\r
+ mov eax, [edx].ds_w_mask\r
+ mov ebx, [edx].ds_max_chain_length\r
+ jl SHORT LastMatchGood\r
+ shr ebx, 2\r
+LastMatchGood:\r
+\r
+; chainlen is decremented once beforehand so that the function can\r
+; use the sign flag instead of the zero flag for the exit test.\r
+; It is then shifted into the high word, to make room for the wmask\r
+; value, which it will always accompany.\r
+\r
+ dec ebx\r
+ shl ebx, 16\r
+ or ebx, eax\r
+ mov [esp+chainlenwmask], ebx\r
+\r
+; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;\r
+\r
+ mov eax, [edx].ds_nice_match\r
+ mov ebx, [edx].ds_lookahead\r
+ cmp ebx, eax\r
+ jl SHORT LookaheadLess\r
+ mov ebx, eax\r
+LookaheadLess:\r
+ mov [esp+nicematch], ebx\r
+\r
+;/* register Bytef *scan = s->window + s->strstart; */\r
+\r
+ mov esi, [edx].ds_window\r
+ mov [esp+window], esi\r
+ mov ebp, [edx].ds_strstart\r
+ lea edi, [esi+ebp]\r
+ mov [esp+scan],edi\r
+\r
+;/* Determine how many bytes the scan ptr is off from being */\r
+;/* dword-aligned. */\r
+\r
+ mov eax, edi\r
+ neg eax\r
+ and eax, 3\r
+ mov [esp+scanalign], eax\r
+\r
+;/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */\r
+;/* s->strstart - (IPos)MAX_DIST(s) : NIL; */\r
+\r
+ mov eax, [edx].ds_w_size\r
+ sub eax, MIN_LOOKAHEAD\r
+ sub ebp, eax\r
+ jg SHORT LimitPositive\r
+ xor ebp, ebp\r
+LimitPositive:\r
+\r
+;/* int best_len = s->prev_length; */\r
+\r
+ mov eax, [edx].ds_prev_length\r
+ mov [esp+bestlen], eax\r
+\r
+;/* Store the sum of s->window + best_len in %esi locally, and in %esi. */\r
+\r
+ add esi, eax\r
+ mov [esp+windowbestlen], esi\r
+\r
+;/* register ush scan_start = *(ushf*)scan; */\r
+;/* register ush scan_end = *(ushf*)(scan+best_len-1); */\r
+;/* Posf *prev = s->prev; */\r
+\r
+ movzx ebx, WORD PTR[edi]\r
+ mov [esp+scanstart], ebx\r
+ movzx ebx, WORD PTR[eax+edi-1]\r
+ mov [esp+scanend], ebx\r
+ mov edi, [edx].ds_prev\r
+\r
+;/* Jump into the main loop. */\r
+\r
+ mov edx, [esp+chainlenwmask]\r
+ jmp SHORT LoopEntry\r
+\r
+;/* do {\r
+; * match = s->window + cur_match;\r
+; * if (*(ushf*)(match+best_len-1) != scan_end ||\r
+; * *(ushf*)match != scan_start) continue;\r
+; * [...]\r
+; * } while ((cur_match = prev[cur_match & wmask]) > limit\r
+; * && --chain_length != 0);\r
+; *\r
+; * Here is the inner loop of the function. The function will spend the\r
+; * majority of its time in this loop, and majority of that time will\r
+; * be spent in the first ten instructions.\r
+; *\r
+; * Within this loop:\r
+; * %ebx = scanend\r
+; * %ecx = curmatch\r
+; * %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)\r
+; * %esi = windowbestlen - i.e., (window + bestlen)\r
+; * %edi = prev\r
+; * %ebp = limit\r
+; */\r
+\r
+ ALIGN 4\r
+LookupLoop:\r
+ and ecx, edx\r
+ movzx ecx, WORD PTR[edi+ecx*2]\r
+ cmp ecx, ebp\r
+ jbe LeaveNow\r
+ sub edx, 000010000H\r
+ js LeaveNow\r
+\r
+LoopEntry:\r
+ movzx eax, WORD PTR[esi+ecx-1]\r
+ cmp eax, ebx\r
+ jnz SHORT LookupLoop\r
+\r
+ mov eax, [esp+window]\r
+ movzx eax, WORD PTR[eax+ecx]\r
+ cmp eax, [esp+scanstart]\r
+ jnz SHORT LookupLoop\r
+\r
+;/* Store the current value of chainlen. */\r
+\r
+ mov [esp+chainlenwmask], edx\r
+\r
+;/* Point %edi to the string under scrutiny, and %esi to the string we */\r
+;/* are hoping to match it up with. In actuality, %esi and %edi are */\r
+;/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */\r
+;/* initialized to -(MAX_MATCH_8 - scanalign). */\r
+\r
+ mov esi, [esp+window]\r
+ mov edi, [esp+scan]\r
+ add esi, ecx\r
+ mov eax, [esp+scanalign]\r
+ mov edx, -MAX_MATCH_8\r
+ lea edi, [edi+eax+MAX_MATCH_8]\r
+ lea esi, [esi+eax+MAX_MATCH_8]\r
+\r
+;/* Test the strings for equality, 8 bytes at a time. At the end,\r
+; * adjust %edx so that it is offset to the exact byte that mismatched.\r
+; *\r
+; * We already know at this point that the first three bytes of the\r
+; * strings match each other, and they can be safely passed over before\r
+; * starting the compare loop. So what this code does is skip over 0-3\r
+; * bytes, as much as necessary in order to dword-align the %edi\r
+; * pointer. (%esi will still be misaligned three times out of four.)\r
+; *\r
+; * It should be confessed that this loop usually does not represent\r
+; * much of the total running time. Replacing it with a more\r
+; * straightforward "rep cmpsb" would not drastically degrade\r
+; * performance.\r
+; */\r
+\r
+LoopCmps:\r
+ mov eax, DWORD PTR[esi+edx]\r
+ xor eax, DWORD PTR[edi+edx]\r
+ jnz SHORT LeaveLoopCmps\r
+\r
+ mov eax, DWORD PTR[esi+edx+4]\r
+ xor eax, DWORD PTR[edi+edx+4]\r
+ jnz SHORT LeaveLoopCmps4\r
+\r
+ add edx, 8\r
+ jnz SHORT LoopCmps\r
+ jmp LenMaximum\r
+ ALIGN 4\r
+\r
+LeaveLoopCmps4:\r
+ add edx, 4\r
+\r
+LeaveLoopCmps:\r
+ test eax, 00000FFFFH\r
+ jnz SHORT LenLower\r
+\r
+ add edx, 2\r
+ shr eax, 16\r
+\r
+LenLower:\r
+ sub al, 1\r
+ adc edx, 0\r
+\r
+;/* Calculate the length of the match. If it is longer than MAX_MATCH, */\r
+;/* then automatically accept it as the best possible match and leave. */\r
+\r
+ lea eax, [edi+edx]\r
+ mov edi, [esp+scan]\r
+ sub eax, edi\r
+ cmp eax, MAX_MATCH\r
+ jge SHORT LenMaximum\r
+\r
+;/* If the length of the match is not longer than the best match we */\r
+;/* have so far, then forget it and return to the lookup loop. */\r
+\r
+ mov edx, [esp+deflatestate]\r
+ mov ebx, [esp+bestlen]\r
+ cmp eax, ebx\r
+ jg SHORT LongerMatch\r
+ mov esi, [esp+windowbestlen]\r
+ mov edi, [edx].ds_prev\r
+ mov ebx, [esp+scanend]\r
+ mov edx, [esp+chainlenwmask]\r
+ jmp LookupLoop\r
+ ALIGN 4\r
+\r
+;/* s->match_start = cur_match; */\r
+;/* best_len = len; */\r
+;/* if (len >= nice_match) break; */\r
+;/* scan_end = *(ushf*)(scan+best_len-1); */\r
+\r
+LongerMatch:\r
+ mov ebx, [esp+nicematch]\r
+ mov [esp+bestlen], eax\r
+ mov [edx].ds_match_start, ecx\r
+ cmp eax, ebx\r
+ jge SHORT LeaveNow\r
+ mov esi, [esp+window]\r
+ add esi, eax\r
+ mov [esp+windowbestlen], esi\r
+ movzx ebx, WORD PTR[edi+eax-1]\r
+ mov edi, [edx].ds_prev\r
+ mov [esp+scanend], ebx\r
+ mov edx, [esp+chainlenwmask]\r
+ jmp LookupLoop\r
+ ALIGN 4\r
+\r
+;/* Accept the current string, with the maximum possible length. */\r
+\r
+LenMaximum:\r
+ mov edx, [esp+deflatestate]\r
+ mov DWORD PTR[esp+bestlen], MAX_MATCH\r
+ mov [edx].ds_match_start, ecx\r
+\r
+;/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */\r
+;/* return s->lookahead; */\r
+\r
+LeaveNow:\r
+ mov edx, [esp+deflatestate]\r
+ mov ebx, [esp+bestlen]\r
+ mov eax, [edx].ds_lookahead\r
+ cmp ebx, eax\r
+ jg SHORT LookaheadRet\r
+ mov eax, ebx\r
+LookaheadRet:\r
+\r
+; Restore the stack and return from whence we came.\r
+\r
+ add esp, varsize\r
+ pop ebx\r
+ pop esi\r
+ pop edi\r
+ pop ebp\r
+ ret\r
+\r
+_longest_match ENDP\r
+\r
+_TEXT ENDS\r
+END\r