|
Moodle
2.2.1
http://www.collinsharper.com
|
<?php

// This file is part of Moodle - http://moodle.org/
//
// Moodle is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Moodle is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Moodle. If not, see <http://www.gnu.org/licenses/>.
//
// Author(s): Jon Abernathy <jon@chuggnutt.com>
// Copyright (c) 2005-2007 Jon Abernathy <jon@chuggnutt.com>

/**
 * Converts an HTML fragment into a plain-text rendering.
 *
 * The conversion is regex driven: the parallel $search/$replace arrays are
 * applied first, then the $callback_search patterns (which need PHP code to
 * build their replacement), then remaining entities are decoded and any
 * left-over tags are removed with strip_tags(). When link collection is
 * enabled, every <a href> is numbered and listed under a "Links:" footer.
 */
class html2text
{
    /**
     * The HTML source to convert.
     *
     * @var string
     */
    public $html;

    /**
     * The converted plain text (valid once _convert() has run).
     *
     * @var string
     */
    public $text;

    /**
     * Maximum line width handed to wordwrap(); 0 or less disables wrapping.
     *
     * The constructor always overwrites this value, so the default here is
     * kept equal to the constructor's default.
     *
     * @var int
     */
    public $width = 75;

    /**
     * Regular expressions applied, in order, by preg_replace() in _convert().
     * Each entry pairs with the entry at the same index in $replace.
     *
     * Tag names that are a prefix of another HTML tag name carry a \b so that,
     * for example, the <i> rule does not swallow <img> and the <b> rule does
     * not swallow <body>.
     *
     * @var array
     */
    public $search = array(
        "/\r/",                                  // Non-legal carriage return
        "/[\n\t]+/",                             // Newlines and tabs
        '/[ ]{2,}/',                             // Runs of spaces, pre-handling
        '/<script[^>]*>.*?<\/script>/i',         // <script>s -- which strip_tags supposedly has problems with
        '/<style[^>]*>.*?<\/style>/i',           // <style>s -- which strip_tags supposedly has problems with
        //'/<!-- .* -->/',                       // Comments -- which strip_tags might have problem a with
        '/<p\b[^>]*>/i',                         // <P>
        '/<br[^>]*>/i',                          // <br>
        '/<i\b[^>]*>(.*?)<\/i>/i',               // <i>
        '/<em\b[^>]*>(.*?)<\/em>/i',             // <em>
        '/(<ul[^>]*>|<\/ul>)/i',                 // <ul> and </ul>
        '/(<ol[^>]*>|<\/ol>)/i',                 // <ol> and </ol>
        '/<li\b[^>]*>(.*?)<\/li>/i',             // <li> and </li>
        '/<li\b[^>]*>/i',                        // <li>
        '/<hr[^>]*>/i',                          // <hr>
        '/(<table[^>]*>|<\/table>)/i',           // <table> and </table>
        '/(<tr\b[^>]*>|<\/tr>)/i',               // <tr> and </tr>
        '/<td[^>]*>(.*?)<\/td>/i',               // <td> and </td>
        '/&(nbsp|#160);/i',                      // Non-breaking space
        '/&(quot|rdquo|ldquo|#8220|#8221|#147|#148);/i',
                                                 // Double quotes
        '/&(apos|rsquo|lsquo|#8216|#8217);/i',   // Single quotes
        '/&gt;/i',                               // Greater-than
        '/&lt;/i',                               // Less-than
        '/&(amp|#38);/i',                        // Ampersand
        '/&(copy|#169);/i',                      // Copyright
        '/&(trade|#8482|#153);/i',               // Trademark
        '/&(reg|#174);/i',                       // Registered
        '/&(mdash|#151|#8212);/i',               // mdash
        '/&(ndash|minus|#8211|#8722);/i',        // ndash
        '/&(bull|#149|#8226);/i',                // Bullet
        '/&(pound|#163);/i',                     // Pound sign
        '/&(euro|#8364);/i',                     // Euro sign
        '/[ ]+([\n\t])/',                        // Trailing spaces before newline or tab
        '/[ ]{2,}/'                              // Runs of spaces, post-handling
    );

    /**
     * Replacements matching $search entry for entry.
     *
     * @var array
     */
    public $replace = array(
        '',                                      // Non-legal carriage return
        ' ',                                     // Newlines and tabs
        ' ',                                     // Runs of spaces, pre-handling
        '',                                      // <script>s -- which strip_tags supposedly has problems with
        '',                                      // <style>s -- which strip_tags supposedly has problems with
        //'',                                    // Comments -- which strip_tags might have problem a with
        "\n\n",                                  // <P>
        "\n",                                    // <br>
        '_\\1_',                                 // <i>
        '_\\1_',                                 // <em>
        "\n\n",                                  // <ul> and </ul>
        "\n\n",                                  // <ol> and </ol>
        "\t* \\1\n",                             // <li> and </li>
        "\n\t* ",                                // <li>
        "\n-------------------------\n",         // <hr>
        "\n\n",                                  // <table> and </table>
        "\n",                                    // <tr> and </tr>
        "\t\t\\1\n",                             // <td> and </td>
        ' ',                                     // Non-breaking space
        '"',                                     // Double quotes
        "'",                                     // Single quotes
        '>',                                     // Greater-than
        '<',                                     // Less-than
        '&',                                     // Ampersand
        '(c)',                                   // Copyright
        '(tm)',                                  // Trademark
        '(R)',                                   // Registered
        '--',                                    // mdash
        '-',                                     // ndash
        '*',                                     // Bullet
        '£',                                     // Pound sign
        'EUR',                                   // Euro sign. € ?
        '\\1',                                   // Trailing spaces before newline or tab
        ' '                                      // Runs of spaces, post-handling
    );

    /**
     * Patterns whose replacement is computed by _preg_callback(). The first
     * capture group is always the tag name, which the callback dispatches on.
     *
     * @var array
     */
    public $callback_search = array(
        '/<(h)[123456][^>]*>(.*?)<\/h[123456]>/i',           // <h1> - <h6>
        '/<(b)\b[^>]*>(.*?)<\/b>/i',                         // <b>
        '/<(strong)[^>]*>(.*?)<\/strong>/i',                 // <strong>
        '/<(a) [^>]*href=("|\')([^"\']+)\2[^>]*>(.*?)<\/a>/i',
                                                             // <a href="">
        '/<(th)\b[^>]*>(.*?)<\/th>/i',                       // <th> and </th>
        '/<(img)[^>]*alt=\"([^>"]+)\"[^>]*>/i',              // <img> with alt
    );

    /**
     * Patterns applied to the content of each <pre> element by _convert_pre().
     * Each entry pairs with the entry at the same index in $pre_replace.
     *
     * @var array
     */
    public $pre_search = array(
        "/\n/",
        "/\t/",
        '/ /',          // NOTE(review): space -> space is a no-op as written; confirm intent.
        '/<pre[^>]*>/',
        '/<\/pre>/'
    );

    /**
     * Replacements matching $pre_search entry for entry.
     *
     * @var array
     */
    public $pre_replace = array(
        '<br>',
        ' ',
        ' ',
        '',
        ''
    );

    /**
     * Tags that strip_tags() is told to keep (its second argument).
     *
     * @var string
     */
    public $allowed_tags = '';

    /**
     * Base URL prepended to relative links in the link list.
     *
     * @var string
     */
    public $url;

    /**
     * Whether $text currently reflects $html.
     *
     * @var bool
     */
    public $_converted = false;

    /**
     * Accumulated "[n] url" lines appended after the converted text.
     *
     * @var string
     */
    public $_link_list = '';

    /**
     * Number of links collected so far during the current conversion.
     *
     * @var int
     */
    public $_link_count = 0;

    /**
     * Whether links are numbered and listed; when false only the link text is kept.
     *
     * @var bool
     */
    public $_do_links = true;

    /**
     * Scratch slot holding the converted content of the <pre> element that
     * _convert_pre() is currently replacing; read by _preg_pre_callback().
     *
     * @var string
     */
    public $pre_content = '';

    /**
     * Constructor.
     *
     * @param string $source    HTML to convert, or a file path when $from_file is true
     * @param bool   $from_file treat $source as the path of a file to read
     * @param bool   $do_links  number links and append a link list to the output
     * @param int    $width     wrap width; 0 or less disables wrapping
     */
    public function __construct( $source = '', $from_file = false, $do_links = true, $width = 75 )
    {
        if ($source !== '') {
            $this->set_html($source, $from_file);
        }

        $this->set_base_url();
        $this->_do_links = $do_links;
        $this->width = $width;
    }

    /**
     * Legacy class-named constructor, kept so existing explicit calls still
     * work. Class-named methods are no longer constructors in PHP 8, which is
     * why the real work lives in __construct().
     *
     * @param string $source    see __construct()
     * @param bool   $from_file see __construct()
     * @param bool   $do_links  see __construct()
     * @param int    $width     see __construct()
     */
    public function html2text( $source = '', $from_file = false, $do_links = true, $width = 75 )
    {
        $this->__construct($source, $from_file, $do_links, $width);
    }

    /**
     * Loads the HTML to convert and marks the cached text as stale.
     *
     * When $from_file is true but the file does not exist, $source itself is
     * used as the HTML.
     *
     * @param string $source    HTML, or a file path when $from_file is true
     * @param bool   $from_file treat $source as the path of a file to read
     */
    public function set_html( $source, $from_file = false )
    {
        if ( $from_file && file_exists($source) ) {
            $this->html = file_get_contents($source);
        } else {
            $this->html = $source;
        }

        $this->_converted = false;
    }

    /**
     * Returns the plain-text version, converting first if needed.
     *
     * @return string
     */
    public function get_text()
    {
        if ( !$this->_converted ) {
            $this->_convert();
        }

        return $this->text;
    }

    /**
     * Prints the plain-text version.
     */
    public function print_text()
    {
        print $this->get_text();
    }

    /**
     * Short alias of print_text().
     */
    public function p()
    {
        $this->print_text();
    }

    /**
     * Sets the tags that survive strip_tags(). An empty argument is ignored,
     * leaving the current setting in place.
     *
     * @param string $allowed_tags tag list in strip_tags() format, e.g. '<b><i>'
     */
    public function set_allowed_tags( $allowed_tags = '' )
    {
        if ( !empty($allowed_tags) ) {
            $this->allowed_tags = $allowed_tags;
        }
    }

    /**
     * Sets the base URL used to absolutise relative links.
     *
     * With no argument the URL is derived from $_SERVER['HTTP_HOST'] (always
     * with the http scheme), or left empty when that is not set.
     *
     * @param string $url base URL; one trailing slash is removed
     */
    public function set_base_url( $url = '' )
    {
        if ( empty($url) ) {
            if ( !empty($_SERVER['HTTP_HOST']) ) {
                $this->url = 'http://' . $_SERVER['HTTP_HOST'];
            } else {
                $this->url = '';
            }
        } else {
            // Strip any trailing slashes for consistency (relative
            // URLs may already start with a slash like "/file.html")
            if ( substr($url, -1) == '/' ) {
                $url = substr($url, 0, -1);
            }
            $this->url = $url;
        }
    }

    /**
     * Runs the whole conversion and stores the result in $text.
     */
    public function _convert()
    {
        // Variables used for building the link list
        $this->_link_count = 0;
        $this->_link_list = '';

        $text = trim($this->html);

        // Convert <PRE>
        $this->_convert_pre($text);

        // Run our defined search-and-replace
        $text = preg_replace($this->search, $this->replace, $text);
        // Bind the callback to this instance; a 'classname' callable would be a
        // static call to a non-static method, which PHP 8 rejects.
        $text = preg_replace_callback($this->callback_search, array($this, '_preg_callback'), $text);

        // Replace known html entities. ENT_QUOTES so that &#039; is decoded
        // too instead of being deleted by the clean-up step below.
        $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');

        // Remove unknown/unhandled entities (this cannot be done in search-and-replace block).
        // Only entity-shaped tokens are removed, so ordinary text between a
        // literal "&" and a later ";" is left alone.
        $text = preg_replace('/&(#[0-9]+|#x[0-9a-f]+|[a-z][a-z0-9]*);/i', '', $text);

        // Strip any other HTML tags
        $text = strip_tags($text, $this->allowed_tags);

        // Bring down number of empty lines to 2 max
        $text = preg_replace("/\n\s+\n/", "\n\n", $text);
        $text = preg_replace("/[\n]{3,}/", "\n\n", $text);

        // Add link list
        if ( !empty($this->_link_list) ) {
            $text .= "\n\nLinks:\n------\n" . $this->_link_list;
        }

        // Wrap the text to a readable format.
        // If width is 0 or less, don't wrap the text.
        if ( $this->width > 0 ) {
            $text = wordwrap($text, $this->width);
        }

        $this->text = $text;

        $this->_converted = true;
    }

    /**
     * Records a link in the link list and returns the text that replaces the
     * <a> element.
     *
     * http://, https:// and mailto: links are listed as they are;
     * javascript: links are not listed; anything else is treated as relative
     * and prefixed with the base URL.
     *
     * @param string $link    the href value
     * @param string $display the link text
     * @return string the link text, followed by " [n]" when the link was listed
     */
    public function _build_link_list( $link, $display )
    {
        if ( !$this->_do_links ) {
            return $display;
        }

        if ( substr($link, 0, 7) == 'http://' || substr($link, 0, 8) == 'https://' ||
             substr($link, 0, 7) == 'mailto:' ) {
            $this->_link_count++;
            $this->_link_list .= "[" . $this->_link_count . "] $link\n";
            $additional = ' [' . $this->_link_count . ']';
        } elseif ( substr($link, 0, 11) == 'javascript:' ) {
            // Don't count the link; ignore it
            $additional = '';
            // what about href="#anchor" ?
        } else {
            $this->_link_count++;
            $this->_link_list .= "[" . $this->_link_count . "] " . $this->url;
            if ( substr($link, 0, 1) != '/' ) {
                $this->_link_list .= '/';
            }
            $this->_link_list .= "$link\n";
            $additional = ' [' . $this->_link_count . ']';
        }

        return $display . $additional;
    }

    /**
     * Replaces every <pre> element in $text with a <div> whose content has
     * been run through $pre_search/$pre_replace.
     *
     * @param string $text HTML being converted; modified in place
     */
    public function _convert_pre(&$text)
    {
        while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) {
            // convert the content
            $this->pre_content = sprintf('<div><br>%s<br></div>',
                preg_replace($this->pre_search, $this->pre_replace, $matches[1]));
            // replace the content (use callback because content can contain $0 variable)
            $text = preg_replace_callback('/<pre[^>]*>.*<\/pre>/ismU',
                array($this, '_preg_pre_callback'), $text, 1);
            // free memory
            $this->pre_content = '';
        }
    }

    /**
     * Builds the replacement for a $callback_search match.
     *
     * The tag name is lower-cased before dispatch because the patterns are
     * case-insensitive; without that, upper-case tags such as <B> matched no
     * branch and their content was dropped.
     *
     * @param array $matches match groups; $matches[1] is the tag name
     * @return string replacement text
     */
    public function _preg_callback($matches)
    {
        switch (strtolower($matches[1])) {
            case 'b':
            case 'strong':
                return $this->_strtoupper($matches[2]);
            case 'th':
                return $this->_strtoupper("\t\t". $matches[2] ."\n");
            case 'h':
                return $this->_strtoupper("\n\n". $matches[2] ."\n\n");
            case 'a':
                return $this->_build_link_list($matches[3], $matches[4]);
            case 'img':
                return '[' . $matches[2] . ']';
            default:
                // Unknown tag: leave the matched markup for strip_tags().
                return $matches[0];
        }
    }

    /**
     * Callback for _convert_pre(): returns the already converted <pre> content.
     *
     * @param array $matches unused
     * @return string
     */
    private function _preg_pre_callback($matches)
    {
        return $this->pre_content;
    }

    /**
     * Upper-cases a string.
     *
     * Uses the object returned by textlib_get_instance() when that function is
     * defined (it is not defined in this file); otherwise falls back to
     * mb_strtoupper() in UTF-8, or plain strtoupper() when mbstring is missing.
     *
     * @param string $str text to convert
     * @return string upper-cased text
     */
    public function _strtoupper($str)
    {
        if (function_exists('textlib_get_instance')) {
            $tl = textlib_get_instance();
            return $tl->strtoupper($str);
        }

        if (function_exists('mb_strtoupper')) {
            return mb_strtoupper($str, 'UTF-8');
        }

        return strtoupper($str);
    }
}