@@ -92,20 +92,20 @@ static int64_t matchlen(u_char *old, int64_t old_size, u_char *new,
9292 * Finds the longest matching array of bytes between the OLD and NEW file. The
9393 * old file is suffix-sorted; the suffix-sorted array is stored at I, and
9494 * indices to search between are indicated by ST (start) and EN (end). The
95- * function does not return a value, but once a match is determined, POS is
95+ * function does not return a value, but once a match is determined, OLD_POS is
9696 * updated to the position of the match within OLD, and MAX_LEN is set to the
9797 * match length.
9898 */
9999static void search (int64_t * I , u_char * old , int64_t old_size ,
100100 u_char * new , int64_t new_size , int64_t st , int64_t en ,
101- int64_t * pos , int64_t * max_len )
101+ int64_t * old_pos , int64_t * max_len )
102102{
103103 int64_t x , y ;
104104
105105 /* Initialize max_len for the binary search */
106106 if (st == 0 && en == old_size ) {
107107 * max_len = matchlen (old , old_size , new , new_size );
108- * pos = I [st ];
108+ * old_pos = I [st ];
109109 }
110110
111111 /* The binary search terminates here when "en" and "st" are adjacent
@@ -114,12 +114,12 @@ static void search(int64_t *I, u_char *old, int64_t old_size,
114114 x = matchlen (old + I [st ], old_size - I [st ], new , new_size );
115115 if (x > * max_len ) {
116116 * max_len = x ;
117- * pos = I [st ];
117+ * old_pos = I [st ];
118118 }
119119 y = matchlen (old + I [en ], old_size - I [en ], new , new_size );
120120 if (y > * max_len ) {
121121 * max_len = y ;
122- * pos = I [en ];
122+ * old_pos = I [en ];
123123 }
124124
125125 return ;
@@ -134,14 +134,14 @@ static void search(int64_t *I, u_char *old, int64_t old_size,
134134 int64_t tmp = matchlen (oldoffset , length , new , length );
135135 if (tmp > * max_len ) {
136136 * max_len = tmp ;
137- * pos = I [x ];
137+ * old_pos = I [x ];
138138 }
139139
140140 /* Determine how to continue the binary search */
141141 if (memcmp (oldoffset , new , length ) < 0 ) {
142- return search (I , old , old_size , new , new_size , x , en , pos , max_len );
142+ return search (I , old , old_size , new , new_size , x , en , old_pos , max_len );
143143 } else {
144- return search (I , old , old_size , new , new_size , st , x , pos , max_len );
144+ return search (I , old , old_size , new , new_size , st , x , old_pos , max_len );
145145 }
146146}
147147
@@ -388,14 +388,6 @@ int make_bsdiff_delta(char *old_filename, char *new_filename, char *delta_filena
388388 u_char * old_data , * new_data ;
389389 int64_t old_size , new_size ;
390390 int64_t * I , * V ;
391- int64_t scan ;
392- int64_t pos = 0 ;
393- int64_t len ;
394- int64_t lastscan , lastpos , lastoffset ;
395- int64_t oldscore , scsc ;
396- int64_t s , Sf , lenf , Sb , lenb ;
397- int64_t overlap , Ss , lens ;
398- int64_t i ;
399391 uint64_t cblen , dblen , eblen ;
400392 u_char * cb , * db , * eb ;
401393 struct stat new_stat ;
@@ -607,103 +599,139 @@ int make_bsdiff_delta(char *old_filename, char *new_filename, char *delta_filena
607599 eblen = 0 ;
608600
609601 /* Compute the differences */
610- scan = 0 ;
611- len = 0 ;
612- lastscan = 0 ;
613- lastpos = 0 ;
614- lastoffset = 0 ;
615- while (scan < new_size ) {
616- oldscore = 0 ;
617-
618- for (scsc = scan += len ; scan < new_size ; scan ++ ) {
619- search (I , old_data , old_size , new_data + scan , new_size - scan ,
620- 0 , old_size , & pos , & len );
621-
622- for (; scsc < scan + len ; scsc ++ ) {
623- if ((scsc + lastoffset < old_size ) &&
624- (old_data [scsc + lastoffset ] == new_data [scsc ])) {
625- oldscore ++ ;
602+ int64_t new_pos = 0 ;
603+ int64_t old_pos = 0 ;
604+ int64_t match_len = 0 ;
605+ int64_t last_new_pos = 0 ;
606+ int64_t last_old_pos = 0 ;
607+ int64_t last_offset = 0 ;
608+ while (new_pos < new_size ) {
609+ // Find an exact match between old and new files. Stop when it
610+ // agrees fully with the previous alignment, or when more than 8
611+ // of its bytes mismatch under that alignment. A score
612+ // (old_score) tracks how many bytes of the match in new equal
613+ // old when old is read at the previous offset (last_offset).
614+ int64_t old_score = 0 ;
615+ int64_t new_peek ;
616+ for (new_peek = new_pos += match_len ; new_pos < new_size ; new_pos ++ ) {
617+ search (I , old_data , old_size , new_data + new_pos , new_size - new_pos ,
618+ 0 , old_size , & old_pos , & match_len );
619+
620+ for (; new_peek < new_pos + match_len ; new_peek ++ ) {
621+ if ((new_peek + last_offset < old_size ) &&
622+ (old_data [new_peek + last_offset ] == new_data [new_peek ])) {
623+ old_score ++ ;
626624 }
627625 }
628626
629- if (((len == oldscore ) && (len != 0 )) ||
630- (len > oldscore + 8 )) {
627+ if (((match_len == old_score ) && (match_len != 0 )) ||
628+ (match_len > old_score + 8 )) {
631629 break ;
632630 }
633631
634- if ((scan + lastoffset < old_size ) &&
635- (old_data [scan + lastoffset ] == new_data [scan ])) {
636- oldscore -- ;
632+ // Before beginning the next loop iteration, decrement
633+ // old_score if needed, since new_pos will be
634+ // incremented.
635+ if ((new_pos + last_offset < old_size ) &&
636+ (old_data [new_pos + last_offset ] == new_data [new_pos ])) {
637+ old_score -- ;
637638 }
638639 }
639640
640- if ((len != oldscore ) || (scan == new_size )) {
641- s = 0 ;
642- Sf = 0 ;
643- lenf = 0 ;
644- for (i = 0 ;
645- (lastscan + i < scan ) && (lastpos + i < old_size );) {
646- if (old_data [lastpos + i ] == new_data [lastscan + i ]) {
647- s ++ ;
641+ if ((match_len != old_score ) || (new_pos == new_size )) {
642+ int64_t bytes = 0 , max = 0 ;
643+ // Compute the length of a fuzzy match extending forward
644+ // from the start of the fuzzy backward region recorded
645+ // at the end of the previous iteration (last_new_pos and
646+ // last_old_pos, i.e. the previous match positions minus
647+ // len_fuzzybackward). At least half of the bytes match
648+ // between old and new. This fuzzy match will be used to
649+ // construct a diff string in the diff block.
650+ int64_t len_fuzzyforward = 0 ;
651+ for (int64_t i = 0 ;
652+ (last_new_pos + i < new_pos ) && (last_old_pos + i < old_size );) {
653+ if (old_data [last_old_pos + i ] == new_data [last_new_pos + i ]) {
654+ bytes ++ ;
648655 }
649656 i ++ ;
650- if (s * 2 - i > Sf * 2 - lenf ) {
651- Sf = s ;
652- lenf = i ;
657+ if (bytes * 2 - i > max * 2 - len_fuzzyforward ) {
658+ max = bytes ;
659+ len_fuzzyforward = i ;
653660 }
654661 }
655662
656- lenb = 0 ;
657- if (scan < new_size ) {
658- s = 0 ;
659- Sb = 0 ;
660- for (i = 1 ;
661- (scan >= lastscan + i ) && (pos >= i );
663+ // Compute the length of a fuzzy match ending at the
664+ // current positions in old and new files (old_pos and
665+ // new_pos). At least half of the bytes match between
666+ // old and new. This fuzzy match will be used for the
667+ // next iteration.
668+ int64_t len_fuzzybackward = 0 ;
669+ if (new_pos < new_size ) {
670+ bytes = 0 ;
671+ max = 0 ;
672+ for (int64_t i = 1 ;
673+ (new_pos >= last_new_pos + i ) && (old_pos >= i );
662674 i ++ ) {
663- if (old_data [pos - i ] == new_data [scan - i ]) {
664- s ++ ;
675+ if (old_data [old_pos - i ] == new_data [new_pos - i ]) {
676+ bytes ++ ;
665677 }
666- if (s * 2 - i > Sb * 2 - lenb ) {
667- Sb = s ;
668- lenb = i ;
678+ if (bytes * 2 - i > max * 2 - len_fuzzybackward ) {
679+ max = bytes ;
680+ len_fuzzybackward = i ;
669681 }
670682 }
671683 }
672684
673- if (lastscan + lenf > scan - lenb ) {
674- overlap = (lastscan + lenf ) - (scan - lenb );
675- s = 0 ;
676- Ss = 0 ;
677- lens = 0 ;
678- for (i = 0 ; i < overlap ; i ++ ) {
679- if (new_data [lastscan + lenf - overlap + i ] ==
680- old_data [lastpos + lenf - overlap + i ]) {
681- s ++ ;
685+ // If there is an overlap between len_fuzzyforward and
686+ // len_fuzzybackward in the new file, that overlap must
687+ // be eliminated.
688+ if (last_new_pos + len_fuzzyforward > new_pos - len_fuzzybackward ) {
689+ bytes = 0 ;
690+ max = 0 ;
691+ int64_t overlap = (last_new_pos + len_fuzzyforward ) - (new_pos - len_fuzzybackward );
692+ int64_t len_fuzzyshift = 0 ;
693+ // Scan the overlap area and pick the split point
694+ // (len_fuzzyshift) where bytes matching under the
695+ // forward alignment most outnumber bytes matching
696+ // under the backward alignment. The forward region
697+ // keeps that prefix; the backward region the rest.
698+ for (int64_t i = 0 ; i < overlap ; i ++ ) {
699+ if (new_data [last_new_pos + len_fuzzyforward - overlap + i ] ==
700+ old_data [last_old_pos + len_fuzzyforward - overlap + i ]) {
701+ bytes ++ ;
682702 }
683- if (new_data [scan - lenb + i ] ==
684- old_data [pos - lenb + i ]) {
685- s -- ;
703+ if (new_data [new_pos - len_fuzzybackward + i ] ==
704+ old_data [old_pos - len_fuzzybackward + i ]) {
705+ bytes -- ;
686706 }
687- if (s > Ss ) {
688- Ss = s ;
689- lens = i + 1 ;
707+ if (bytes > max ) {
708+ max = bytes ;
709+ len_fuzzyshift = i + 1 ;
690710 }
691711 }
692712
693- lenf += lens - overlap ;
694- lenb -= lens ;
713+ len_fuzzyforward += len_fuzzyshift - overlap ;
714+ len_fuzzybackward -= len_fuzzyshift ;
695715 }
696716
697- for (i = 0 ; i < lenf ; i ++ ) {
717+ // Set the diff string in the diff block. For each byte
718+ // in the fuzzy forward region, the byte from old is
719+ // subtracted from new. When applying the delta (with
720+ // bspatch) this operation is reversed, by performing
721+ // additions.
722+ for (int64_t i = 0 ; i < len_fuzzyforward ; i ++ ) {
698723 db [dblen + i ] =
699- new_data [lastscan + i ] - old_data [lastpos + i ];
724+ new_data [last_new_pos + i ] - old_data [last_old_pos + i ];
700725 }
701- for (i = 0 ; i < (scan - lenb ) - (lastscan + lenf ); i ++ ) {
702- eb [eblen + i ] = new_data [lastscan + lenf + i ];
726+ // Set the extra string in the extra block. The
727+ // contents are the bytes in new file between the fuzzy
728+ // forward and fuzzy backward regions.
729+ for (int64_t i = 0 ; i < (new_pos - len_fuzzybackward ) - (last_new_pos + len_fuzzyforward ); i ++ ) {
730+ eb [eblen + i ] = new_data [last_new_pos + len_fuzzyforward + i ];
703731 }
704732
705- dblen += lenf ;
706- eblen += (scan - lenb ) - (lastscan + lenf );
733+ dblen += len_fuzzyforward ;
734+ eblen += (new_pos - len_fuzzybackward ) - (last_new_pos + len_fuzzyforward );
707735
708736 /* checking for control block overflow...
709737 * See regression test #15 for an example */
@@ -717,18 +745,28 @@ int make_bsdiff_delta(char *old_filename, char *new_filename, char *delta_filena
717745 return -1 ;
718746 }
719747
720- offtout (lenf , cb + cblen );
748+ // Set three values in the control block:
749+ // 1. ADD instruction (value: length of the diff
750+ // string). It uses the offset of the third control
751+ // block value from the previous iteration.
752+ // 2. INSERT instruction (value: length of the extra
753+ // string)
754+ // 3. offset in old file for the next ADD instruction
755+ offtout (len_fuzzyforward , cb + cblen );
721756 cblen += 8 ;
722757
723- offtout ((scan - lenb ) - (lastscan + lenf ), cb + cblen );
758+ offtout ((new_pos - len_fuzzybackward ) - (last_new_pos + len_fuzzyforward ), cb + cblen );
724759 cblen += 8 ;
725760
726- offtout ((pos - lenb ) - (lastpos + lenf ), cb + cblen );
761+ offtout ((old_pos - len_fuzzybackward ) - (last_old_pos + len_fuzzyforward ), cb + cblen );
727762 cblen += 8 ;
728763
729- lastscan = scan - lenb ;
730- lastpos = pos - lenb ;
731- lastoffset = pos - scan ;
764+ // Save old/new file positions to the beginning of the
765+ // fuzzy backward region, since the next fuzzy forward
766+ // region will be calculated from that point.
767+ last_new_pos = new_pos - len_fuzzybackward ;
768+ last_old_pos = old_pos - len_fuzzybackward ;
769+ last_offset = old_pos - new_pos ;
732770 }
733771 }
734772 free (I );
0 commit comments