|
Moodle
2.2.1
http://www.collinsharper.com
|
00001 <?php 00010 // Standard diff 00012 00027 function ouwiki_diff_internal($file1,$file2) { 00028 // Basic variables 00029 $n=count($file2); 00030 $m=count($file1); 00031 00032 // Special-case for empty file2 which otherwise causes error 00033 if($n==0) 00034 { 00035 $result=array(); 00036 for($i=1;$i<=$m;$i++) 00037 { 00038 $result[$i]=0; 00039 } 00040 return $result; 00041 } 00042 00043 // Step 1 Build list of elements 00045 00046 $V=array(); 00047 for($j=1;$j<=$n;$j++) { 00048 $V[$j]=new StdClass; 00049 $V[$j]->serial=$j; 00050 $V[$j]->hash=crc32($file2[$j]); 00051 } 00052 00053 // Step 2 Sort by hash,serial 00055 00056 usort($V,"ouwiki_diff_sort_v"); 00057 00058 // Make it start from 1 again 00059 array_unshift($V,'bogus'); 00060 unset($V[0]); 00061 00062 // $V is now an array including the line number 'serial' and hash 00063 // of each line in file 2, sorted by hash and then serial. 00064 00065 // Step 3 Equivalence classes 00067 00068 $E=array(); 00069 $E[0]=new StdClass; 00070 $E[0]->serial=0; 00071 $E[0]->last=true; 00072 for($j=1;$j<=$n;$j++) { 00073 $E[$j]=new StdClass; 00074 $E[$j]->serial=$V[$j]->serial; 00075 $E[$j]->last=$j===$n || $V[$j]->hash!==$V[$j+1]->hash; 00076 } 00077 00078 // E is now an array sorted the same way as $V which includes 00079 // the line number 'serial' and whether or not that is the 'last' 00080 // line in the given equivalence class, i.e. set of identical lines 00081 00082 // Step 4 For each line in file1, finds start of equivalence class 00084 $P=array(); 00085 for($i=1;$i<=$m;$i++) { 00086 // Find matching last entry from equivalence list 00087 $P[$i]=ouwiki_diff_find_last($V,$E,crc32($file1[$i])); 00088 } 00089 00090 // P is now an array that finds the index (within $V) of the *first* 00091 // matching line in $V (referencing file 2, but not a line number, 00092 // because sorted in $V order) for each line in file 1. In other words 00093 // if you were to start at the P-value in $V and continue through, you 00094 // would find all the lines from file 2 that are equal to the given line 00095 // from file 1. 00096 00097 // Step 5 Initialise vector of candidates 00099 00100 // I do not trust PHP references further than I can throw them (preferably 00101 // at the idiot who came up with the idea) so I am using a separate array 00102 // to store candidates and all references are integers into that. 00103 00104 $candidates=array(); 00105 $candidates[0]=new StdClass; 00106 $candidates[0]->a=0; 00107 $candidates[0]->b=0; 00108 $candidates[0]->previous=null; 00109 $candidates[1]=new StdClass; 00110 $candidates[1]->a=$m+1; 00111 $candidates[1]->b=$n+1; 00112 $candidates[1]->previous=null; 00113 00114 $K=array(); 00115 $K[0]=0; // Ref to candidate 0 00116 $K[1]=1; // Ref to candidate 1 00117 $k=0; 00118 00119 // Step 6 Merge stage 00121 00122 for($i=1;$i<=$m;$i++) { 00123 if($P[$i]!==0) { 00124 ouwiki_diff_merge($K,$k,$i,$E,$P[$i],$candidates); 00125 } 00126 } 00127 00128 // Step 7 00130 00131 $J=array(); 00132 for($i=1;$i<=$m;$i++) { 00133 $J[$i]=0; 00134 } 00135 00136 // Step 8 Follow candidate chain to make nice representation 00138 00139 $index=$K[$k]; 00140 while(!is_null($index)) { 00141 // Stop when we reach the first, dummy candidate 00142 if($candidates[$index]->a!=0) { 00143 $J[$candidates[$index]->a]=$candidates[$index]->b; 00144 } 00145 $index=$candidates[$index]->previous; 00146 } 00147 00148 // Step 9 Get rid of 'jackpots' (hash collisions) 00150 00151 for($i=1;$i<=$m;$i++) { 00152 if($J[$i]!=0 && $file1[$i]!=$file2[$J[$i]]) { 00153 $J[$i]=0; 00154 } 00155 } 00156 00157 // Done! (Maybe.) 00158 return $J; 00159 } 00160 00161 // Functions needed by parts of the algorithm 00163 00164 // Merge, from step 7 (Appendix A.3) 00165 function ouwiki_diff_merge(&$K,&$k,$i,&$E,$p,&$candidates) { 00166 $r=0; 00167 $c=$K[0]; 00168 00169 while(true) { 00170 $j=$E[$p]->serial; // Paper says 'i' but this is wrong (OCR) 00171 00172 // Binary search in $K from $r to $k 00173 $min=$r; 00174 $max=$k+1; 00175 00176 while(true) { 00177 $try = (int)(($min+$max)/2); 00178 if($candidates[$K[$try]]->b >= $j) { 00179 $max=$try; 00180 } else if($candidates[$K[$try+1]]->b <= $j) { 00181 $min=$try+1; 00182 } else { // $try is less and $try+1 is more 00183 $s=$try; 00184 break; 00185 } 00186 if($max<=$min) { 00187 $s=-1; 00188 break; 00189 } 00190 } 00191 00192 if($s>-1) { 00193 if($candidates[$K[$s+1]]->b > $j) { 00194 // Create new candidate 00195 $index=count($candidates); 00196 $candidates[$index]=new StdClass; 00197 $candidates[$index]->a=$i; 00198 $candidates[$index]->b=$j; 00199 $candidates[$index]->previous=$K[$s]; 00200 $K[$r]=$c; 00201 $r=$s+1; 00202 $c=$index; // Or should this go before? 00203 } 00204 00205 if($s===$k) { 00206 $K[$k+2]=$K[$k+1]; 00207 $k++; 00208 break; 00209 } 00210 } 00211 00212 if($E[$p]->last) { 00213 break; 00214 } 00215 00216 $p++; 00217 } 00218 $K[$r]=$c; 00219 00220 } 00221 00222 // From Step 2 00223 function ouwiki_diff_sort_v($a,$b) { 00224 if($a->hash < $b->hash) { 00225 return -1; 00226 } else if($a->hash > $b->hash) { 00227 return 1; 00228 } else if($a->serial < $b->serial) { 00229 return -1; 00230 } else if($a->serial > $b->serial) { 00231 return 1; 00232 } else { 00233 return 0; 00234 } 00235 } 00236 00237 // From Step 4 00238 function ouwiki_diff_find_last(&$V,&$E,$hash) { 00239 // Binary search in $V until we find something with $hash 00240 00241 // Min = 1, array is 1-indexed 00242 $min=1; 00243 // Max = 1 higher than highest key 00244 end($V); 00245 $max=key($V)+1; 00246 while(true) { 00247 $try = (int)(($min+$max)/2); 00248 if($V[$try]->hash > $hash) { 00249 $max=$try; 00250 } else if($V[$try]->hash < $hash) { 00251 $min=$try+1; 00252 } else { // Equal 00253 break; 00254 } 00255 if($max<=$min) { 00256 // No matching line 00257 return 0; 00258 } 00259 } 00260 00261 // Now check back in $E to find the first line of that equivalence class 00262 for($j=$try;!$E[$j-1]->last;$j--) ; 00263 return $j; 00264 } 00265 00267 00268 00273 class ouwiki_line { 00275 var $words=array(); 00276 00282 function ouwiki_line($data,$linepos) { 00283 // 1. Turn things we don't want into spaces (so that positioning stays same) 00284 00285 // Whitespace replaced with space 00286 $data=preg_replace('/\s/',' ',$data); 00287 00288 // Various ways of writing non-breaking space replaced with space 00289 // Note that using a single param for replace only works because all 00290 // the search strings are 6 characters long 00291 $data=str_replace(array(' ',' ',' '),' ',$data); 00292 00293 // Tags replaced with equal number of spaces 00294 $data=preg_replace_callback('/<.*?'.'>/',create_function( 00295 '$matches','return preg_replace("/./"," ",$matches[0]);'),$data); 00296 00297 // 2. Analyse string so that each space-separated thing 00298 // is counted as a 'word' (note these may not be real words, 00299 // for instance words may include punctuation at either end) 00300 $pos=0; 00301 while(true) { 00302 // Find a non-space 00303 $strlendata = strlen($data); 00304 for(;$pos < $strlendata && substr($data,$pos,1)===' ';$pos++) ; 00305 if($pos==$strlendata) { 00306 // No more content 00307 break; 00308 } 00309 00310 // Aaaand find the next space after that 00311 $space2=strpos($data,' ',$pos); 00312 if($space2===false) { 00313 // No more spaces? Everything left must be a word 00314 $this->words[]=new ouwiki_word(substr($data,$pos),$pos+$linepos); 00315 break; 00316 } else { 00317 $this->words[]=new ouwiki_word(substr($data,$pos,$space2-$pos),$pos+$linepos); 00318 $pos=$space2; 00319 } 00320 } 00321 } 00322 00326 function get_as_string() { 00327 $result=''; 00328 foreach($this->words as $word) { 00329 if($result!=='') { 00330 $result.=' '; 00331 } 00332 $result.=$word->word; 00333 } 00334 return $result; 00335 } 00336 00342 function get_as_strings($lines) { 00343 $strings=array(); 00344 foreach($lines as $key=>$value) { 00345 $strings[$key]=$value->get_as_string(); 00346 } 00347 return $strings; 00348 } 00349 00350 00354 function is_empty() { 00355 return count($this->words)===0; 00356 } 00357 } 00358 00365 class ouwiki_word { 00367 var $word; 00369 var $start; 00370 00371 function ouwiki_word($word,$start) { 00372 $this->word=$word; 00373 $this->start=$start; 00374 } 00375 } 00376 00382 function ouwiki_diff_html_to_lines($content) { 00383 // These functions are a pain mostly because PHP preg_* don't provide 00384 // proper information as to the start/end position of matches. As a 00385 // consequence there is a lot of hackery going down. At every point we 00386 // replace things with spaces rather than getting rid, in order to store 00387 // positions within original content. 00388 00389 // Get rid of all script, style, object tags (that might contain non-text 00390 // outside tags) 00391 $content=preg_replace_callback( 00392 '^(<script .*?</script>)|(<object .*?</object>)|(<style .*?</style>)^i',create_function( 00393 '$matches','return preg_replace("/./"," ",$matches[0]);'),$content); 00394 00395 // Get rid of all ` symbols as we are going to use these for a marker later. 00396 $content=preg_replace('/[`]/',' ',$content); 00397 00398 // Put line breaks on block tags. Mark each line break with ` symbol 00399 $blocktags=array('p','div','h1','h2','h3','h4','h5','h6','td','li'); 00400 $taglist=''; 00401 foreach($blocktags as $blocktag) { 00402 if($taglist!=='') { 00403 $taglist.='|'; 00404 } 00405 $taglist.="<$blocktag>|<\\/$blocktag>"; 00406 } 00407 $content=preg_replace_callback('/(('.$taglist.')\s*)+/i',create_function( 00408 '$matches','return "`".preg_replace("/./"," ",substr($matches[0],1));'),$content); 00409 00410 // Now go through splitting each line 00411 $lines=array(); $index=1; 00412 $pos=0; 00413 while($pos<strlen($content)) { 00414 $nextline=strpos($content,'`',$pos); 00415 if($nextline===false) { 00416 // No more line breaks? Take content to end 00417 $nextline=strlen($content); 00418 } 00419 00420 $linestr=substr($content,$pos,$nextline-$pos); 00421 $line=new ouwiki_line($linestr,$pos); 00422 if(!$line->is_empty()) { 00423 $lines[$index++]=$line; 00424 } 00425 $pos=$nextline+1; 00426 } 00427 return $lines; 00428 } 00429 00434 class ouwiki_change_range { 00435 var $file1start,$file1count; 00436 var $file2start,$file2count; 00437 } 00438 00442 class ouwiki_changes { 00443 00445 var $adds; 00446 00448 var $deletes; 00449 00451 var $changes; 00452 00458 function ouwiki_changes($diff,$count2) { 00459 // Find deleted lines 00460 $this->deletes=self::internal_find_deletes($diff,$count2); 00461 00462 // Added lines work the same way after the comparison is 00463 // reversed. 00464 $this->adds=self::internal_find_deletes( 00465 ouwiki_diff_internal_flip($diff,$count2),count($diff)); 00466 00467 // Changed ranges are all the other lines from file 1 that 00468 // weren't found in file 2 but aren't deleted, and the 00469 // corresponding lines from file 2 (between the equivalent 00470 // 'found' lines). 00471 $this->changes=array(); 00472 $matchbefore=0; 00473 $inrange=-1; $lastrange=-1; 00474 foreach($diff as $index1=>$index2) { 00475 // Changed line if this isn't in 'deleted' section and 00476 // doesn't have a match in file2. 00477 if($index2===0 && !in_array($index1,$this->deletes)) { 00478 if($inrange===-1) { 00479 // Not already in a range, start a new one at array end 00480 $inrange=count($this->changes); 00481 $this->changes[$inrange]=new ouwiki_change_range; 00482 $this->changes[$inrange]->file1start=$index1; 00483 $this->changes[$inrange]->file1count=1; 00484 $this->changes[$inrange]->file2start=$matchbefore+1; // Last valid from file2 00485 $this->changes[$inrange]->file2count=0; 00486 $lastrange=$inrange; 00487 } else { 00488 // One more line that gets added to the range 00489 $this->changes[$inrange]->file1count++; 00490 } 00491 } else { 00492 // Not in a range any more 00493 $inrange=-1; 00494 // If we have a line match... 00495 if($index2!==0) { 00496 // Remember this line as next range must start after it 00497 $matchbefore=$index2; 00498 // If last range is still looking for a number, fill that in too 00499 if($lastrange!==-1) { 00500 $this->changes[$lastrange]->file2count=$index2 00501 -$this->changes[$lastrange]->file2start; 00502 $lastrange=-1; 00503 } 00504 } 00505 } 00506 } 00507 // Unfinished range in file2 gets end of file 00508 if($lastrange!==-1) { 00509 $this->changes[$lastrange]->file2count=$count2 00510 -$this->changes[$lastrange]->file2start+1; 00511 } 00512 } 00513 00522 function internal_find_deletes($diff,$count2) { 00523 $deletes=array(); 00524 00525 // 1. Create a new array that includes the lowest-valued 00526 // index2 value below each run of 0s. 00527 // I.e. if our array is say 1,2,0,0,0,3,0 then the 00528 // resulting array will be -,-,3,3,3,-,0 00529 $squidges=array(); 00530 $lowest=0; 00531 $countdiff = count($diff); 00532 for($index1=$countdiff;$index1>=1;$index1--) { 00533 $index2=$diff[$index1]; 00534 if($index2===0) { 00535 $squidges[$index1]=$lowest; 00536 } else { 00537 $lowest=$index2; 00538 } 00539 } 00540 00541 // 2. OK now we can use this new array to work out 00542 // items that are known to be deleted because we 00543 // have matching items either side 00544 $highest=0; 00545 foreach($diff as $index1=>$index2) { 00546 if($index2===0) { 00547 if($highest===$count2 || $highest+1===$squidges[$index1]) { 00548 // Yep! Definitely deleted. 00549 $deletes[]=$index1; 00550 } 00551 } else { 00552 $highest=$index2; 00553 } 00554 } 00555 return $deletes; 00556 } 00557 } 00558 00566 function ouwiki_diff_internal_flip($diff,$count2) { 00567 $flip=array(); 00568 for($i=1;$i<=$count2;$i++) { 00569 $flip[$i]=0; 00570 } 00571 foreach($diff as $index1=>$index2) { 00572 if($index2!==0) { 00573 $flip[$index2]=$index1; 00574 } 00575 } 00576 return $flip; 00577 } 00578 00587 function ouwiki_diff_words($lines1,$lines2) { 00588 // Prepare arrays 00589 $deleted=array(); 00590 $added=array(); 00591 // Get line difference 00592 $linediff=ouwiki_diff( 00593 ouwiki_line::get_as_strings($lines1), 00594 ouwiki_line::get_as_strings($lines2)); 00595 00596 // Handle lines that were entirely deleted 00597 foreach($linediff->deletes as $deletedline) { 00598 $deleted = array_merge($deleted, $lines1[$deletedline]->words); 00599 } 00600 // And ones that were entirely added 00601 foreach($linediff->adds as $addedline) { 00602 $added = array_merge($added, $lines2[$addedline]->words); 00603 } 00604 00605 // Changes get diffed at the individual-word level 00606 foreach($linediff->changes as $changerange) { 00607 // Build list of all words in each side of the range 00608 $file1words=array(); 00609 for($index=$changerange->file1start; 00610 $index<$changerange->file1start+$changerange->file1count;$index++) { 00611 foreach($lines1[$index]->words as $word) { 00612 $file1words[]=$word; 00613 } 00614 } 00615 $file2words=array(); 00616 for($index=$changerange->file2start; 00617 $index<$changerange->file2start+$changerange->file2count;$index++) { 00618 foreach($lines2[$index]->words as $word) { 00619 $file2words[]=$word; 00620 } 00621 } 00622 00623 // Make arrays 1-based 00624 array_unshift($file1words,'dummy'); 00625 unset($file1words[0]); 00626 array_unshift($file2words,'dummy'); 00627 unset($file2words[0]); 00628 00629 // Convert word lists into plain strings 00630 $file1strings=array(); 00631 foreach($file1words as $index=>$word) { 00632 $file1strings[$index]=$word->word; 00633 } 00634 $file2strings=array(); 00635 foreach($file2words as $index=>$word) { 00636 $file2strings[$index]=$word->word; 00637 } 00638 00639 // Run diff on strings 00640 $worddiff=ouwiki_diff($file1strings,$file2strings); 00641 foreach($worddiff->adds as $index) { 00642 $added[]=$file2words[$index]; 00643 } 00644 foreach($worddiff->deletes as $index) { 00645 $deleted[]=$file1words[$index]; 00646 } 00647 foreach($worddiff->changes as $changerange) { 00648 for($index=$changerange->file1start; 00649 $index<$changerange->file1start+$changerange->file1count;$index++) { 00650 $deleted[]=$file1words[$index]; 00651 } 00652 for($index=$changerange->file2start; 00653 $index<$changerange->file2start+$changerange->file2count;$index++) { 00654 $added[]=$file2words[$index]; 00655 } 00656 } 00657 } 00658 00659 return array($deleted,$added); 00660 } 00661 00669 function ouwiki_diff($file1,$file2) { 00670 return new ouwiki_changes(ouwiki_diff_internal($file1,$file2),count($file2)); 00671 } 00672 00680 function ouwiki_diff_add_markers($html,$words,$markerclass,$beforetext,$aftertext) { 00681 // Sort words by start position 00682 usort($words, create_function('$a,$b','return $a->start-$b->start;')); 00683 00684 // Add marker for each word. We use an odd tag name which will 00685 // be replaced by span later, this for ease of replacing 00686 $spanstart="<ouwiki_diff_add_markers>"; 00687 $pos=0; 00688 $result=''; 00689 foreach($words as $word) { 00690 // Add everything up to the word 00691 $result.=substr($html,$pos,$word->start-$pos); 00692 // Add word 00693 $result.=$spanstart.$word->word.'</ouwiki_diff_add_markers>'; 00694 // Update position 00695 $pos=$word->start+strlen($word->word); 00696 } 00697 00698 // Add everything after last word 00699 $result.=substr($html,$pos); 00700 00701 // If we end a marker then immediately start one, get rid of 00702 // both the end and start 00703 $result=preg_replace('^</ouwiki_diff_add_markers>(\s*)<ouwiki_diff_add_markers>^','$1',$result); 00704 00705 // Turn markers into proper span 00706 $result=preg_replace('^<ouwiki_diff_add_markers>^',$beforetext.'<span class="'.$markerclass.'">',$result); 00707 $result=preg_replace('^</ouwiki_diff_add_markers>^','</span>'.$aftertext,$result); 00708 00709 return $result; 00710 } 00711 00718 function ouwiki_diff_html($html1,$html2) { 00719 $lines1=ouwiki_diff_html_to_lines($html1); 00720 $lines2=ouwiki_diff_html_to_lines($html2); 00721 list($deleted,$added)=ouwiki_diff_words($lines1,$lines2); 00722 $result1=ouwiki_diff_add_markers($html1,$deleted,'ouw_deleted', 00723 '<strong class="accesshide">'.get_string('deletedbegins','wiki').'</strong>', 00724 '<strong class="accesshide">'.get_string('deletedends','wiki').'</strong>'); 00725 $result2=ouwiki_diff_add_markers($html2,$added,'ouw_added', 00726 '<strong class="accesshide">'.get_string('addedbegins','wiki').'</strong>', 00727 '<strong class="accesshide">'.get_string('addedends','wiki').'</strong>'); 00728 return array($result1,$result2); 00729 } 00730