match686.asm 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479
  1. ; match686.asm -- Asm portion of the optimized longest_match for 32 bits x86
  2. ; Copyright (C) 1995-1996 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
  3. ; File written by Gilles Vollant, by converting match686.S from Brian Raiter
  4. ; for MASM. This is as assembly version of longest_match
  5. ; from Jean-loup Gailly in deflate.c
  6. ;
  7. ; http://www.zlib.net
  8. ; http://www.winimage.com/zLibDll
  9. ; http://www.muppetlabs.com/~breadbox/software/assembly.html
  10. ;
  11. ; For Visual C++ 4.x and higher and ML 6.x and higher
  12. ; ml.exe is distributed in
  13. ; http://www.microsoft.com/downloads/details.aspx?FamilyID=7a1c9da0-0510-44a2-b042-7ef370530c64
  14. ;
  15. ; this file contain two implementation of longest_match
  16. ;
  17. ; this longest_match was written by Brian raiter (1998), optimized for Pentium Pro
  18. ; (and the faster known version of match_init on modern Core 2 Duo and AMD Phenom)
  19. ;
  20. ; for using an assembly version of longest_match, you need define ASMV in project
  21. ;
  22. ; compile the asm file running
  23. ; ml /coff /Zi /c /Flmatch686.lst match686.asm
  24. ; and do not include match686.obj in your project
  25. ;
  26. ; note: contrib of zLib 1.2.3 and earlier contained both a deprecated version for
  27. ; Pentium (prior Pentium Pro) and this version for Pentium Pro and modern processor
  28. ; with autoselect (with cpu detection code)
  29. ; if you want support the old pentium optimization, you can still use these version
  30. ;
  31. ; this file is not optimized for old pentium, but it compatible with all x86 32 bits
  32. ; processor (starting 80386)
  33. ;
  34. ;
  35. ; see below : zlib1222add must be adjuster if you use a zlib version < 1.2.2.2
  36. ;uInt longest_match(s, cur_match)
  37. ; deflate_state *s;
  38. ; IPos cur_match; /* current match */
  39. NbStack equ 76
  40. cur_match equ dword ptr[esp+NbStack-0]
  41. str_s equ dword ptr[esp+NbStack-4]
  42. ; 5 dword on top (ret,ebp,esi,edi,ebx)
  43. adrret equ dword ptr[esp+NbStack-8]
  44. pushebp equ dword ptr[esp+NbStack-12]
  45. pushedi equ dword ptr[esp+NbStack-16]
  46. pushesi equ dword ptr[esp+NbStack-20]
  47. pushebx equ dword ptr[esp+NbStack-24]
  48. chain_length equ dword ptr [esp+NbStack-28]
  49. limit equ dword ptr [esp+NbStack-32]
  50. best_len equ dword ptr [esp+NbStack-36]
  51. window equ dword ptr [esp+NbStack-40]
  52. prev equ dword ptr [esp+NbStack-44]
  53. scan_start equ word ptr [esp+NbStack-48]
  54. wmask equ dword ptr [esp+NbStack-52]
  55. match_start_ptr equ dword ptr [esp+NbStack-56]
  56. nice_match equ dword ptr [esp+NbStack-60]
  57. scan equ dword ptr [esp+NbStack-64]
  58. windowlen equ dword ptr [esp+NbStack-68]
  59. match_start equ dword ptr [esp+NbStack-72]
  60. strend equ dword ptr [esp+NbStack-76]
  61. NbStackAdd equ (NbStack-24)
  62. .386p
  63. name gvmatch
  64. .MODEL FLAT
  65. ; all the +zlib1222add offsets are due to the addition of fields
  66. ; in zlib in the deflate_state structure since the asm code was first written
  67. ; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
  68. ; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
  69. ; if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8").
  70. zlib1222add equ 8
  71. ; Note : these value are good with a 8 bytes boundary pack structure
  72. dep_chain_length equ 74h+zlib1222add
  73. dep_window equ 30h+zlib1222add
  74. dep_strstart equ 64h+zlib1222add
  75. dep_prev_length equ 70h+zlib1222add
  76. dep_nice_match equ 88h+zlib1222add
  77. dep_w_size equ 24h+zlib1222add
  78. dep_prev equ 38h+zlib1222add
  79. dep_w_mask equ 2ch+zlib1222add
  80. dep_good_match equ 84h+zlib1222add
  81. dep_match_start equ 68h+zlib1222add
  82. dep_lookahead equ 6ch+zlib1222add
  83. _TEXT segment
  84. IFDEF NOUNDERLINE
  85. public longest_match
  86. public match_init
  87. ELSE
  88. public _longest_match
  89. public _match_init
  90. ENDIF
  91. MAX_MATCH equ 258
  92. MIN_MATCH equ 3
  93. MIN_LOOKAHEAD equ (MAX_MATCH+MIN_MATCH+1)
  94. MAX_MATCH equ 258
  95. MIN_MATCH equ 3
  96. MIN_LOOKAHEAD equ (MAX_MATCH + MIN_MATCH + 1)
  97. MAX_MATCH_8_ equ ((MAX_MATCH + 7) AND 0FFF0h)
  98. ;;; stack frame offsets
  99. chainlenwmask equ esp + 0 ; high word: current chain len
  100. ; low word: s->wmask
  101. window equ esp + 4 ; local copy of s->window
  102. windowbestlen equ esp + 8 ; s->window + bestlen
  103. scanstart equ esp + 16 ; first two bytes of string
  104. scanend equ esp + 12 ; last two bytes of string
  105. scanalign equ esp + 20 ; dword-misalignment of string
  106. nicematch equ esp + 24 ; a good enough match size
  107. bestlen equ esp + 28 ; size of best match so far
  108. scan equ esp + 32 ; ptr to string wanting match
  109. LocalVarsSize equ 36
  110. ; saved ebx byte esp + 36
  111. ; saved edi byte esp + 40
  112. ; saved esi byte esp + 44
  113. ; saved ebp byte esp + 48
  114. ; return address byte esp + 52
  115. deflatestate equ esp + 56 ; the function arguments
  116. curmatch equ esp + 60
  117. ;;; Offsets for fields in the deflate_state structure. These numbers
  118. ;;; are calculated from the definition of deflate_state, with the
  119. ;;; assumption that the compiler will dword-align the fields. (Thus,
  120. ;;; changing the definition of deflate_state could easily cause this
  121. ;;; program to crash horribly, without so much as a warning at
  122. ;;; compile time. Sigh.)
  123. dsWSize equ 36+zlib1222add
  124. dsWMask equ 44+zlib1222add
  125. dsWindow equ 48+zlib1222add
  126. dsPrev equ 56+zlib1222add
  127. dsMatchLen equ 88+zlib1222add
  128. dsPrevMatch equ 92+zlib1222add
  129. dsStrStart equ 100+zlib1222add
  130. dsMatchStart equ 104+zlib1222add
  131. dsLookahead equ 108+zlib1222add
  132. dsPrevLen equ 112+zlib1222add
  133. dsMaxChainLen equ 116+zlib1222add
  134. dsGoodMatch equ 132+zlib1222add
  135. dsNiceMatch equ 136+zlib1222add
  136. ;;; match686.asm -- Pentium-Pro-optimized version of longest_match()
  137. ;;; Written for zlib 1.1.2
  138. ;;; Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com>
  139. ;;; You can look at http://www.muppetlabs.com/~breadbox/software/assembly.html
  140. ;;;
  141. ;;
  142. ;; This software is provided 'as-is', without any express or implied
  143. ;; warranty. In no event will the authors be held liable for any damages
  144. ;; arising from the use of this software.
  145. ;;
  146. ;; Permission is granted to anyone to use this software for any purpose,
  147. ;; including commercial applications, and to alter it and redistribute it
  148. ;; freely, subject to the following restrictions:
  149. ;;
  150. ;; 1. The origin of this software must not be misrepresented; you must not
  151. ;; claim that you wrote the original software. If you use this software
  152. ;; in a product, an acknowledgment in the product documentation would be
  153. ;; appreciated but is not required.
  154. ;; 2. Altered source versions must be plainly marked as such, and must not be
  155. ;; misrepresented as being the original software
  156. ;; 3. This notice may not be removed or altered from any source distribution.
  157. ;;
  158. ;GLOBAL _longest_match, _match_init
  159. ;SECTION .text
  160. ;;; uInt longest_match(deflate_state *deflatestate, IPos curmatch)
  161. ;_longest_match:
  162. IFDEF NOUNDERLINE
  163. longest_match proc near
  164. ELSE
  165. _longest_match proc near
  166. ENDIF
  167. .FPO (9, 4, 0, 0, 1, 0)
  168. ;;; Save registers that the compiler may be using, and adjust esp to
  169. ;;; make room for our stack frame.
  170. push ebp
  171. push edi
  172. push esi
  173. push ebx
  174. sub esp, LocalVarsSize
  175. ;;; Retrieve the function arguments. ecx will hold cur_match
  176. ;;; throughout the entire function. edx will hold the pointer to the
  177. ;;; deflate_state structure during the function's setup (before
  178. ;;; entering the main loop.
  179. mov edx, [deflatestate]
  180. mov ecx, [curmatch]
  181. ;;; uInt wmask = s->w_mask;
  182. ;;; unsigned chain_length = s->max_chain_length;
  183. ;;; if (s->prev_length >= s->good_match) {
  184. ;;; chain_length >>= 2;
  185. ;;; }
  186. mov eax, [edx + dsPrevLen]
  187. mov ebx, [edx + dsGoodMatch]
  188. cmp eax, ebx
  189. mov eax, [edx + dsWMask]
  190. mov ebx, [edx + dsMaxChainLen]
  191. jl LastMatchGood
  192. shr ebx, 2
  193. LastMatchGood:
  194. ;;; chainlen is decremented once beforehand so that the function can
  195. ;;; use the sign flag instead of the zero flag for the exit test.
  196. ;;; It is then shifted into the high word, to make room for the wmask
  197. ;;; value, which it will always accompany.
  198. dec ebx
  199. shl ebx, 16
  200. or ebx, eax
  201. mov [chainlenwmask], ebx
  202. ;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
  203. mov eax, [edx + dsNiceMatch]
  204. mov ebx, [edx + dsLookahead]
  205. cmp ebx, eax
  206. jl LookaheadLess
  207. mov ebx, eax
  208. LookaheadLess: mov [nicematch], ebx
  209. ;;; register Bytef *scan = s->window + s->strstart;
  210. mov esi, [edx + dsWindow]
  211. mov [window], esi
  212. mov ebp, [edx + dsStrStart]
  213. lea edi, [esi + ebp]
  214. mov [scan], edi
  215. ;;; Determine how many bytes the scan ptr is off from being
  216. ;;; dword-aligned.
  217. mov eax, edi
  218. neg eax
  219. and eax, 3
  220. mov [scanalign], eax
  221. ;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
  222. ;;; s->strstart - (IPos)MAX_DIST(s) : NIL;
  223. mov eax, [edx + dsWSize]
  224. sub eax, MIN_LOOKAHEAD
  225. sub ebp, eax
  226. jg LimitPositive
  227. xor ebp, ebp
  228. LimitPositive:
  229. ;;; int best_len = s->prev_length;
  230. mov eax, [edx + dsPrevLen]
  231. mov [bestlen], eax
  232. ;;; Store the sum of s->window + best_len in esi locally, and in esi.
  233. add esi, eax
  234. mov [windowbestlen], esi
  235. ;;; register ush scan_start = *(ushf*)scan;
  236. ;;; register ush scan_end = *(ushf*)(scan+best_len-1);
  237. ;;; Posf *prev = s->prev;
  238. movzx ebx, word ptr [edi]
  239. mov [scanstart], ebx
  240. movzx ebx, word ptr [edi + eax - 1]
  241. mov [scanend], ebx
  242. mov edi, [edx + dsPrev]
  243. ;;; Jump into the main loop.
  244. mov edx, [chainlenwmask]
  245. jmp short LoopEntry
  246. align 4
  247. ;;; do {
  248. ;;; match = s->window + cur_match;
  249. ;;; if (*(ushf*)(match+best_len-1) != scan_end ||
  250. ;;; *(ushf*)match != scan_start) continue;
  251. ;;; [...]
  252. ;;; } while ((cur_match = prev[cur_match & wmask]) > limit
  253. ;;; && --chain_length != 0);
  254. ;;;
  255. ;;; Here is the inner loop of the function. The function will spend the
  256. ;;; majority of its time in this loop, and majority of that time will
  257. ;;; be spent in the first ten instructions.
  258. ;;;
  259. ;;; Within this loop:
  260. ;;; ebx = scanend
  261. ;;; ecx = curmatch
  262. ;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
  263. ;;; esi = windowbestlen - i.e., (window + bestlen)
  264. ;;; edi = prev
  265. ;;; ebp = limit
  266. LookupLoop:
  267. and ecx, edx
  268. movzx ecx, word ptr [edi + ecx*2]
  269. cmp ecx, ebp
  270. jbe LeaveNow
  271. sub edx, 00010000h
  272. js LeaveNow
  273. LoopEntry: movzx eax, word ptr [esi + ecx - 1]
  274. cmp eax, ebx
  275. jnz LookupLoop
  276. mov eax, [window]
  277. movzx eax, word ptr [eax + ecx]
  278. cmp eax, [scanstart]
  279. jnz LookupLoop
  280. ;;; Store the current value of chainlen.
  281. mov [chainlenwmask], edx
  282. ;;; Point edi to the string under scrutiny, and esi to the string we
  283. ;;; are hoping to match it up with. In actuality, esi and edi are
  284. ;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is
  285. ;;; initialized to -(MAX_MATCH_8 - scanalign).
  286. mov esi, [window]
  287. mov edi, [scan]
  288. add esi, ecx
  289. mov eax, [scanalign]
  290. mov edx, 0fffffef8h; -(MAX_MATCH_8)
  291. lea edi, [edi + eax + 0108h] ;MAX_MATCH_8]
  292. lea esi, [esi + eax + 0108h] ;MAX_MATCH_8]
  293. ;;; Test the strings for equality, 8 bytes at a time. At the end,
  294. ;;; adjust edx so that it is offset to the exact byte that mismatched.
  295. ;;;
  296. ;;; We already know at this point that the first three bytes of the
  297. ;;; strings match each other, and they can be safely passed over before
  298. ;;; starting the compare loop. So what this code does is skip over 0-3
  299. ;;; bytes, as much as necessary in order to dword-align the edi
  300. ;;; pointer. (esi will still be misaligned three times out of four.)
  301. ;;;
  302. ;;; It should be confessed that this loop usually does not represent
  303. ;;; much of the total running time. Replacing it with a more
  304. ;;; straightforward "rep cmpsb" would not drastically degrade
  305. ;;; performance.
  306. LoopCmps:
  307. mov eax, [esi + edx]
  308. xor eax, [edi + edx]
  309. jnz LeaveLoopCmps
  310. mov eax, [esi + edx + 4]
  311. xor eax, [edi + edx + 4]
  312. jnz LeaveLoopCmps4
  313. add edx, 8
  314. jnz LoopCmps
  315. jmp short LenMaximum
  316. LeaveLoopCmps4: add edx, 4
  317. LeaveLoopCmps: test eax, 0000FFFFh
  318. jnz LenLower
  319. add edx, 2
  320. shr eax, 16
  321. LenLower: sub al, 1
  322. adc edx, 0
  323. ;;; Calculate the length of the match. If it is longer than MAX_MATCH,
  324. ;;; then automatically accept it as the best possible match and leave.
  325. lea eax, [edi + edx]
  326. mov edi, [scan]
  327. sub eax, edi
  328. cmp eax, MAX_MATCH
  329. jge LenMaximum
  330. ;;; If the length of the match is not longer than the best match we
  331. ;;; have so far, then forget it and return to the lookup loop.
  332. mov edx, [deflatestate]
  333. mov ebx, [bestlen]
  334. cmp eax, ebx
  335. jg LongerMatch
  336. mov esi, [windowbestlen]
  337. mov edi, [edx + dsPrev]
  338. mov ebx, [scanend]
  339. mov edx, [chainlenwmask]
  340. jmp LookupLoop
  341. ;;; s->match_start = cur_match;
  342. ;;; best_len = len;
  343. ;;; if (len >= nice_match) break;
  344. ;;; scan_end = *(ushf*)(scan+best_len-1);
  345. LongerMatch: mov ebx, [nicematch]
  346. mov [bestlen], eax
  347. mov [edx + dsMatchStart], ecx
  348. cmp eax, ebx
  349. jge LeaveNow
  350. mov esi, [window]
  351. add esi, eax
  352. mov [windowbestlen], esi
  353. movzx ebx, word ptr [edi + eax - 1]
  354. mov edi, [edx + dsPrev]
  355. mov [scanend], ebx
  356. mov edx, [chainlenwmask]
  357. jmp LookupLoop
  358. ;;; Accept the current string, with the maximum possible length.
  359. LenMaximum: mov edx, [deflatestate]
  360. mov dword ptr [bestlen], MAX_MATCH
  361. mov [edx + dsMatchStart], ecx
  362. ;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
  363. ;;; return s->lookahead;
  364. LeaveNow:
  365. mov edx, [deflatestate]
  366. mov ebx, [bestlen]
  367. mov eax, [edx + dsLookahead]
  368. cmp ebx, eax
  369. jg LookaheadRet
  370. mov eax, ebx
  371. LookaheadRet:
  372. ;;; Restore the stack and return from whence we came.
  373. add esp, LocalVarsSize
  374. pop ebx
  375. pop esi
  376. pop edi
  377. pop ebp
  378. ret
  379. ; please don't remove this string !
  380. ; Your can freely use match686 in any free or commercial app if you don't remove the string in the binary!
  381. db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998",0dh,0ah
  382. IFDEF NOUNDERLINE
  383. longest_match endp
  384. ELSE
  385. _longest_match endp
  386. ENDIF
  387. IFDEF NOUNDERLINE
  388. match_init proc near
  389. ret
  390. match_init endp
  391. ELSE
  392. _match_init proc near
  393. ret
  394. _match_init endp
  395. ENDIF
  396. _TEXT ends
  397. end