|
Moodle
2.2.1
http://www.collinsharper.com
|
00001 <?php 00002 /*************************************************************** 00003 * Copyright notice 00004 * 00005 * (c) 2003-2011 Kasper Skårhøj (kasperYYYY@typo3.com) 00006 * All rights reserved 00007 * 00008 * This script is part of the Typo3 project. The Typo3 project is 00009 * free software; you can redistribute it and/or modify 00010 * it under the terms of the GNU General Public License as published by 00011 * the Free Software Foundation; either version 2 of the License, or 00012 * (at your option) any later version. 00013 * 00014 * The GNU General Public License can be found at 00015 * http://www.gnu.org/copyleft/gpl.html. 00016 * 00017 * This script is distributed in the hope that it will be useful, 00018 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00019 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00020 * GNU General Public License for more details. 00021 * 00022 * This copyright notice MUST APPEAR in all copies of the script! 00023 ***************************************************************/ 00129 class t3lib_cs { 00130 var $noCharByteVal = 63; // ASCII Value for chars with no equivalent. 
// This is the array where parsed conversion tables are stored (cached).
// Layout per charset: array('local' => array(localCode => utf8String), 'utf8' => array(utf8String => localCode))
var $parsedCharsets = array();

// An array where case folding data will be stored (cached)
var $caseFolding = array();

// An array where charset-to-ASCII mappings are stored (cached)
var $toASCII = array();

// This tells the converter which charsets have two bytes per char:
var $twoByteSets = array(
	'ucs-2' => 1, // 2-byte Unicode
);

// This tells the converter which charsets have four bytes per char:
var $fourByteSets = array(
	'ucs-4' => 1, // 4-byte Unicode
	'utf-32' => 1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
);

// This tells the converter which charsets use a scheme like the Extended Unix Code:
var $eucBasedSets = array(
	'gb2312' => 1, // Chinese, simplified.
	'big5' => 1, // Chinese, traditional.
	'euc-kr' => 1, // Korean
	'shift_jis' => 1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
);

// Charset aliases => canonical charset name (all keys lowercase, see parse_charset()).
// see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
// http://czyborra.com/charsets/iso8859.html
var $synonyms = array(
	'us' => 'ascii',
	'us-ascii' => 'ascii',
	'cp819' => 'iso-8859-1',
	'ibm819' => 'iso-8859-1',
	'iso-ir-100' => 'iso-8859-1',
	'iso-ir-101' => 'iso-8859-2',
	'iso-ir-109' => 'iso-8859-3',
	'iso-ir-110' => 'iso-8859-4',
	'iso-ir-144' => 'iso-8859-5',
	'iso-ir-127' => 'iso-8859-6',
	'iso-ir-126' => 'iso-8859-7',
	'iso-ir-138' => 'iso-8859-8',
	'iso-ir-148' => 'iso-8859-9',
	'iso-ir-157' => 'iso-8859-10',
	'iso-ir-179' => 'iso-8859-13',
	'iso-ir-199' => 'iso-8859-14',
	'iso-ir-203' => 'iso-8859-15',
	'csisolatin1' => 'iso-8859-1',
	'csisolatin2' => 'iso-8859-2',
	'csisolatin3' => 'iso-8859-3',
	'csisolatin5' => 'iso-8859-9',
	'csisolatin8' => 'iso-8859-14',
	'csisolatin9' => 'iso-8859-15',
	'csisolatingreek' => 'iso-8859-7',
	'iso-celtic' => 'iso-8859-14',
	'latin1' => 'iso-8859-1',
	'latin2' => 'iso-8859-2',
	'latin3' => 'iso-8859-3',
	'latin5' => 'iso-8859-9',
	'latin6' => 'iso-8859-10',
	'latin8' => 'iso-8859-14',
	'latin9' => 'iso-8859-15',
	'l1' => 'iso-8859-1',
	'l2' => 'iso-8859-2',
	'l3' => 'iso-8859-3',
	'l5' => 'iso-8859-9',
	'l6' => 'iso-8859-10',
	'l8' => 'iso-8859-14',
	'l9' => 'iso-8859-15',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'tis-620' => 'iso-8859-11',
	'win874' => 'windows-874',
	'win1250' => 'windows-1250',
	'win1251' => 'windows-1251',
	'win1252' => 'windows-1252',
	'win1253' => 'windows-1253',
	'win1254' => 'windows-1254',
	'win1255' => 'windows-1255',
	'win1256' => 'windows-1256',
	'win1257' => 'windows-1257',
	'win1258' => 'windows-1258',
	'cp1250' => 'windows-1250',
	'cp1251' => 'windows-1251',
	'cp1252' => 'windows-1252',
	'ms-ee' => 'windows-1250',
	'ms-ansi' => 'windows-1252',
	'ms-greek' => 'windows-1253',
	'ms-turk' => 'windows-1254',
	'winbaltrim' => 'windows-1257',
	'koi-8ru' => 'koi-8r',
	'koi8r' => 'koi-8r',
	'cp878' => 'koi-8r',
	'mac' => 'macroman',
	'macintosh' => 'macroman',
	'euc-cn' => 'gb2312',
	'x-euc-cn' => 'gb2312',
	'euccn' => 'gb2312',
	'cp936' => 'gb2312',
	'big-5' => 'big5',
	'cp950' => 'big5',
	'eucjp' => 'euc-jp',
	'sjis' => 'shift_jis',
	'shift-jis' => 'shift_jis',
	'cp932' => 'shift_jis',
	'cp949' => 'euc-kr',
	'utf7' => 'utf-7',
	'utf8' => 'utf-8',
	'utf16' => 'utf-16',
	'utf32' => 'utf-32',
	'ucs2' => 'ucs-2',
	'ucs4' => 'ucs-4',
);

// mapping of language codes / language names to script names
var $lang_to_script = array(
	// iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
	'ar' => 'arabic',
	'bg' => 'cyrillic', // Bulgarian
	'bs' => 'east_european', // Bosnian
	'cs' => 'east_european', // Czech
	'da' => 'west_european', // Danish
	'de' => 'west_european', // German
	'es' => 'west_european', // Spanish
	'et' => 'estonian',
	'eo' => 'unicode', // Esperanto
	'eu' => 'west_european', // Basque
	'fa' => 'arabic', // Persian
	'fi' => 'west_european', // Finnish
	'fo' => 'west_european', // Faroese
	'fr' => 'west_european', // French
	'ga' => 'west_european', // Irish
	'gl' => 'west_european', // Galician
	'gr' => 'greek',
	'he' => 'hebrew', // Hebrew (since 1998)
	'hi' => 'unicode', // Hindi
	'hr' => 'east_european', // Croatian
	'hu' => 'east_european', // Hungarian
	'iw' => 'hebrew', // Hebrew (til 1998)
	'is' => 'west_european', // Icelandic
	'it' => 'west_european', // Italian
	'ja' => 'japanese',
	'ka' => 'unicode', // Georgian
	'kl' => 'west_european', // Greenlandic
	'km' => 'unicode', // Khmer
	'ko' => 'korean',
	'lt' => 'lithuanian',
	'lv' => 'west_european', // Latvian/Lettish
	'nl' => 'west_european', // Dutch
	'no' => 'west_european', // Norwegian
	'nb' => 'west_european', // Norwegian Bokmal
	'nn' => 'west_european', // Norwegian Nynorsk
	'pl' => 'east_european', // Polish
	'pt' => 'west_european', // Portuguese
	'ro' => 'east_european', // Romanian
	'ru' => 'cyrillic', // Russian
	'sk' => 'east_european', // Slovak
	'sl' => 'east_european', // Slovenian
	'sr' => 'cyrillic', // Serbian
	'sv' => 'west_european', // Swedish
	'sq' => 'albanian', // Albanian
	'th' => 'thai',
	'uk' => 'cyrillic', // Ukrainian
	'vi' => 'vietnamese',
	'zh' => 'chinese',
	// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
	// http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
	'ara' => 'arabic',
	'bgr' => 'cyrillic', // Bulgarian
	'cat' => 'west_european', // Catalan
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european', // Czech
	'dan' => 'west_european', // Danish
	'deu' => 'west_european', // German
	'dea' => 'west_european', // German (Austrian)
	'des' => 'west_european', // German (Swiss)
	'ena' => 'west_european', // English (Australian)
	'enc' => 'west_european', // English (Canadian)
	'eng' => 'west_european', // English
	'enz' => 'west_european', // English (New Zealand)
	'enu' => 'west_european', // English (United States)
	'euq' => 'west_european', // Basque
	'fos' => 'west_european', // Faroese
	'far' => 'arabic', // Persian
	'fin' => 'west_european', // Finnish
	'fra' => 'west_european', // French
	'frb' => 'west_european', // French (Belgian)
	'frc' => 'west_european', // French (Canadian)
	'frs' => 'west_european', // French (Swiss)
	'geo' => 'unicode', // Georgian
	'glg' => 'west_european', // Galician
	'ell' => 'greek',
	'heb' => 'hebrew',
	'hin' => 'unicode', // Hindi
	'hun' => 'east_european', // Hungarian
	'isl' => 'west_european', // Icelandic (value was misspelled 'west_euorpean')
	'ita' => 'west_european', // Italian
	'its' => 'west_european', // Italian (Swiss)
	'jpn' => 'japanese',
	'khm' => 'unicode', // Khmer
	'kor' => 'korean',
	'lth' => 'lithuanian',
	'lvi' => 'west_european', // Latvian/Lettish
	'msl' => 'west_european', // Malay
	'nlb' => 'west_european', // Dutch (Belgian)
	'nld' => 'west_european', // Dutch
	'nor' => 'west_european', // Norwegian (bokmal)
	'non' => 'west_european', // Norwegian (nynorsk)
	'plk' => 'east_european', // Polish
	'ptg' => 'west_european', // Portuguese
	'ptb' => 'west_european', // Portuguese (Brazil)
	'rom' => 'east_european', // Romanian
	'rus' => 'cyrillic', // Russian
	'slv' => 'east_european', // Slovenian
	'sky' => 'east_european', // Slovak
	'srl' => 'east_european', // Serbian (Latin)
	'srb' => 'cyrillic', // Serbian (Cyrillic)
	'esp' => 'west_european', // Spanish (trad. sort)
	'esm' => 'west_european', // Spanish (Mexican)
	'esn' => 'west_european', // Spanish (internat. sort)
	'sve' => 'west_european', // Swedish
	'sqi' => 'albanian', // Albanian
	'tha' => 'thai',
	'trk' => 'turkish',
	'ukr' => 'cyrillic', // Ukrainian
	// English language names
	'albanian' => 'albanian',
	'arabic' => 'arabic',
	'basque' => 'west_european',
	'bosnian' => 'east_european',
	'bulgarian' => 'cyrillic', // was 'east_european'; aligned with the 'bg' and 'bgr' entries above
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'esperanto' => 'unicode',
	'estonian' => 'estonian',
	'faroese' => 'west_european',
	'farsi' => 'arabic',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'georgian' => 'unicode',
	'german' => 'west_european',
	'greek' => 'greek',
	'greenlandic' => 'west_european',
	'hebrew' => 'hebrew',
	'hindi' => 'unicode',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'khmer' => 'unicode',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'lithuanian' => 'lithuanian',
	'malay' => 'west_european',
	'norwegian' => 'west_european',
	'persian' => 'arabic',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'serbian' => 'cyrillic',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
	'swedish' => 'west_european',
	'svedish' => 'west_european', // legacy misspelling of 'swedish', kept for backward compatibility
	'thai' => 'thai',
	'that' => 'thai', // legacy misspelling of 'thai', kept for backward compatibility
	'turkish' => 'turkish',
	'ukrainian' => 'cyrillic',
);

// mapping of language (family) names to charsets on Unix
// (an empty value makes get_locale_charset() fall back to its default)
var $script_to_charset_unix = array(
	'west_european' => 'iso-8859-1',
	'estonian' => 'iso-8859-1',
	'east_european' => 'iso-8859-2',
	'baltic' => 'iso-8859-4',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'greek' => 'iso-8859-7',
	'hebrew' => 'iso-8859-8',
	'turkish' => 'iso-8859-9',
	'thai' => 'iso-8859-11', // = TIS-620
	'lithuanian' => 'iso-8859-13',
	'chinese' => 'gb2312', // = euc-cn
	'japanese' => 'euc-jp',
	'korean' => 'euc-kr',
	'simpl_chinese' => 'gb2312',
	'trad_chinese' => 'big5',
	'vietnamese' => '',
	'unicode' => 'utf-8',
	'albanian' => 'utf-8'
);

// mapping of language (family) names to charsets on Windows
var $script_to_charset_windows = array(
	'east_european' => 'windows-1250',
	'cyrillic' => 'windows-1251',
	'west_european' => 'windows-1252',
	'greek' => 'windows-1253',
	'turkish' => 'windows-1254',
	'hebrew' => 'windows-1255',
	'arabic' => 'windows-1256',
	'baltic' => 'windows-1257',
	'estonian' => 'windows-1257',
	'lithuanian' => 'windows-1257',
	'vietnamese' => 'windows-1258',
	'thai' => 'cp874',
	'korean' => 'cp949',
	'chinese' => 'gb2312',
	'japanese' => 'shift_jis',
	'simpl_chinese' => 'gb2312',
	'trad_chinese' => 'big5',
	'albanian' => 'windows-1250',
	'unicode' => 'utf-8'
);

// mapping of locale names to charsets (keys are matched case-insensitively by get_locale_charset())
var $locale_to_charset = array(
	'japanese.euc' => 'euc-jp',
	'ja_jp.ujis' => 'euc-jp',
	'korean.euc' => 'euc-kr',
	'sr@Latn' => 'iso-8859-2',
	'zh_cn' => 'gb2312',
	'zh_hk' => 'big5',
	'zh_tw' => 'big5',
);

// TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
// Empty values means "iso-8859-1"
var $charSetArray = array(
	'dk' => '',
	'de' => '',
	'no' => '',
	'it' => '',
	'fr' => '',
	'es' => '',
	'nl' => '',
	'cz' => 'windows-1250',
	'pl' => 'iso-8859-2',
	'si' => 'windows-1250',
	'fi' => '',
	'tr' => 'iso-8859-9',
	'se' => '',
	'pt' => '',
	'ru' => 'windows-1251',
	'ro' => 'iso-8859-2',
	'ch' => 'gb2312',
	'sk' => 'windows-1250',
	'lt' => 'windows-1257',
	'is' => 'utf-8',
	'hr' => 'windows-1250',
	'hu' => 'iso-8859-2',
	'gl' => '',
	'th' => 'iso-8859-11',
	'gr' => 'iso-8859-7',
	'hk' => 'big5',
	'eu' => '',
	'bg' => 'windows-1251',
	'br' => '',
	'et' => 'iso-8859-4',
	'ar' => 'iso-8859-6',
	'he' => 'utf-8',
	'ua' => 'windows-1251',
	'jp' => 'shift_jis',
	'lv' => 'utf-8',
	'vn' => 'utf-8',
	'ca' => 'iso-8859-15',
	'ba' => 'iso-8859-2',
	'kr' => 'euc-kr',
	'eo' => 'utf-8',
	'my' => '',
	'hi' => 'utf-8',
	'fo' => 'utf-8',
	'fa' => 'utf-8',
	'sr' => 'utf-8',
	'sq' => 'utf-8',
	'ge' => 'utf-8',
	'ga' => '',
	'km' => 'utf-8',
	'qc' => '',
);

// TYPO3 specific: Array with the iso names used for each system language in TYPO3:
// Missing keys means: same as Typo3
var $isoArray = array(
	'ba' => 'bs',
	'br' => 'pt_BR',
	'ch' => 'zh_CN',
	'cz' => 'cs',
	'dk' => 'da',
	'si' => 'sl',
	'se' => 'sv',
	'gl' => 'kl',
	'gr' => 'el',
	'hk' => 'zh_HK',
	'kr' => 'ko',
	'ua' => 'uk',
	'jp' => 'ja',
	'qc' => 'fr_CA',
	'vn' => 'vi',
	'ge' => 'ka',
	'ga' => 'gl',
);

/**
 * Normalizes a charset name: lowercases and trims it, then resolves it
 * through $this->synonyms if it is a known alias.
 *
 * @param string $charset Input charset name
 * @return string Normalized charset name (unknown names are returned lowercased/trimmed)
 */
function parse_charset($charset) {
	$charset = trim(strtolower($charset));
	if (isset($this->synonyms[$charset])) {
		$charset = $this->synonyms[$charset];
	}

	return $charset;
}

/**
 * Determines the charset for a locale string of the form
 * "language[_country][.charset][@modifier]".
 *
 * Resolution order:
 *  1. exact (case-insensitive) match in $this->locale_to_charset
 *  2. explicit ".charset" part, normalized with parse_charset()
 *  3. "@euro" modifier => iso-8859-15
 *  4. language => script => charset, using the Windows table if TYPO3_OS is 'WIN',
 *     otherwise the Unix table; defaults are 'windows-1252' / 'iso-8859-1'.
 *
 * @param string $locale Locale string
 * @return string Charset resolved for the locale
 */
function get_locale_charset($locale) {
	$locale = strtolower($locale);

	// exact locale specific charset?
	// The input has just been lowercased, so compare against lowercased keys;
	// otherwise mixed-case table keys such as 'sr@Latn' could never match.
	$localeTable = array_change_key_case($this->locale_to_charset, CASE_LOWER);
	if (isset($localeTable[$locale])) {
		return $localeTable[$locale];
	}

	// get modifier (array_pad avoids undefined-offset notices when the delimiter is absent)
	list($locale, $modifier) = array_pad(explode('@', $locale), 2, '');

	// locale contains charset: use it
	list($locale, $charset) = array_pad(explode('.', $locale), 2, '');
	if ($charset) {
		return $this->parse_charset($charset);
	}

	// modifier is 'euro' (after charset check, because of xx.utf-8@euro)
	if ($modifier == 'euro') {
		return 'iso-8859-15';
	}

	// get language
	list($language) = explode('_', $locale);
	$script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

	// Unknown scripts and empty table values fall back to the platform default.
	if (defined('TYPO3_OS') && TYPO3_OS == 'WIN') {
		$cs = !empty($this->script_to_charset_windows[$script]) ? $this->script_to_charset_windows[$script] : 'windows-1252';
	} else {
		$cs = !empty($this->script_to_charset_unix[$script]) ? $this->script_to_charset_unix[$script] : 'iso-8859-1';
	}

	return $cs;
}


/********************************************
 *
 * Charset Conversion functions
 *
 ********************************************/

/**
 * Converts $str from charset $fromCS to charset $toCS.
 *
 * If $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'] names one of
 * 'mbstring', 'iconv' or 'recode', that library is tried first (only when the
 * target is utf-8 or no entity fallback is requested). If the library call
 * returns FALSE, or no method is configured, the conversion goes through
 * utf8_encode() / utf8_decode() of this class.
 *
 * @param string $str String to convert
 * @param string $fromCS Source charset (expected to be normalized, e.g. by parse_charset())
 * @param string $toCS Target charset (expected to be normalized)
 * @param boolean $useEntityForNoChar If set, chars without equivalent in $toCS become numeric entities
 * @return string Converted string
 */
function conv($str, $fromCS, $toCS, $useEntityForNoChar = 0) {
	if ($fromCS == $toCS) {
		return $str;
	}

	// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
	if ($toCS == 'utf-8' || !$useEntityForNoChar) {
		$convMethod = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'])
			? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
			: '';

		switch ($convMethod) {
			case 'mbstring':
				$conv_str = mb_convert_encoding($str, $toCS, $fromCS);
				if (FALSE !== $conv_str) {
					return $conv_str;
				} // returns false for unsupported charsets
				break;

			case 'iconv':
				$conv_str = iconv($fromCS, $toCS . '//TRANSLIT', $str);
				if (FALSE !== $conv_str) {
					return $conv_str;
				}
				break;

			case 'recode':
				$conv_str = recode_string($fromCS . '..' . $toCS, $str);
				if (FALSE !== $conv_str) {
					return $conv_str;
				}
				break;
		}
		// fallback to TYPO3 conversion
	}

	if ($fromCS != 'utf-8') {
		$str = $this->utf8_encode($str, $fromCS);
	}
	if ($toCS != 'utf-8') {
		$str = $this->utf8_decode($str, $toCS, $useEntityForNoChar);
	}
	return $str;
}

/**
 * Converts all string values of an array (recursively) with conv().
 * Non-string, non-array values are left untouched.
 *
 * @param array $array Array to convert; modified in place (passed by reference)
 * @param string $fromCS Source charset
 * @param string $toCS Target charset
 * @param boolean $useEntityForNoChar See conv()
 * @return void
 */
function convArray(&$array, $fromCS, $toCS, $useEntityForNoChar = 0) {
	foreach ($array as $key => $value) {
		if (is_array($value)) {
			$this->convArray($array[$key], $fromCS, $toCS, $useEntityForNoChar);
		} elseif (is_string($value)) {
			$array[$key] = $this->conv($value, $fromCS, $toCS, $useEntityForNoChar);
		}
	}
}

/**
 * Converts $str from $charset to UTF-8 using the parsed conversion table.
 *
 * @param string $str String in local charset
 * @param string $charset Source charset; 'utf-8' returns $str unchanged
 * @return string|NULL UTF-8 string, or NULL if initCharset() could not provide a conversion table
 */
function utf8_encode($str, $charset) {

	if ($charset === 'utf-8') {
		return $str;
	}

	// Parse conv. table if not already...
	if (!$this->initCharset($charset)) {
		return NULL;
	}

	$strLen = strlen($str);
	$outStr = '';
	$noChar = chr($this->noCharByteVal);
	$isTwoByteSet = isset($this->twoByteSets[$charset]);
	$isEucBasedSet = isset($this->eucBasedSets[$charset]);

	for ($a = 0; $a < $strLen; $a++) { // Traverse each byte in string.
		$chr = $str[$a];
		$ord = ord($chr);

		if ($isTwoByteSet) { // If the charset has two bytes per char
			// A missing trailing byte (odd-length input) is treated as 0.
			$ord2 = ($a + 1 < $strLen) ? ord($str[$a + 1]) : 0;
			$ord = $ord << 8 | $ord2; // assume big endian
			$a++;
		} elseif ($ord > 127) { // Byte over 127: needs a table lookup
			// EUC uses two bytes above 127; we get both, advance the pointer and make $ord a 16bit int.
			// Shift-JIS: bytes between 160 and 223 are single byte chars.
			if ($isEucBasedSet && ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF))) {
				$a++;
				$ord2 = ($a < $strLen) ? ord($str[$a]) : 0;
				$ord = $ord * 256 + $ord2;
			}
		} else { // ... otherwise it's just ASCII 0-127 and one byte. Transparent
			$outStr .= $chr;
			continue;
		}

		// If the local char-number was found in the parsed conv. table we use that,
		// otherwise the "no char" replacement byte.
		if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
			$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
		} else {
			$outStr .= $noChar;
		}
	}

	return $outStr;
}

/**
 * Converts $str from UTF-8 to $charset using the parsed conversion table.
 *
 * @param string $str UTF-8 string
 * @param string $charset Target charset; 'utf-8' returns $str unchanged
 * @param boolean $useEntityForNoChar If set, chars missing in the table become hex numeric entities instead of the "no char" byte
 * @return string|NULL String in target charset, or NULL if initCharset() could not provide a conversion table
 */
function utf8_decode($str, $charset, $useEntityForNoChar = 0) {

	if ($charset === 'utf-8') {
		return $str;
	}

	// Parse conv. table if not already...
	if (!$this->initCharset($charset)) {
		return NULL;
	}

	$strLen = strlen($str);
	$outStr = '';
	$noChar = chr($this->noCharByteVal);

	for ($a = 0; $a < $strLen; $a++) { // Traverse each byte in UTF-8 string.
		$chr = substr($str, $a, 1);
		$ord = ord($chr);

		if ($ord <= 127) { // ASCII 0-127 and one byte. Transparent
			$outStr .= $chr;
			continue;
		}
		if (!($ord & 64)) { // A lead byte must have the 7th bit set; otherwise we are in the MIDDLE of a MB sequence
			$outStr .= $noChar;
			continue;
		}

		$buf = $chr; // Add first byte
		for ($b = 0; $b < 8; $b++) { // for each byte in multibyte string...
			$ord = $ord << 1; // Shift it left and ...
			if (!($ord & 128)) { // ... test the 8th bit - if it is not set, the sequence is complete.
				break;
			}
			$a++; // Increase pointer...
			$buf .= substr($str, $a, 1); // ... and add the next byte.
		}

		if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) { // If the UTF-8 char-sequence is found then...
			$mByte = $this->parsedCharsets[$charset]['utf8'][$buf]; // The local number
			if ($mByte > 255) { // Local numbers above 255 are split in two bytes (16bit word assumed)
				$outStr .= chr(($mByte >> 8) & 255) . chr($mByte & 255);
			} else {
				$outStr .= chr($mByte);
			}
		} elseif ($useEntityForNoChar) { // Create num entity:
			$outStr .= '&#' . $this->utf8CharToUnumber($buf, 1) . ';';
		} else { // No char exists
			$outStr .= $noChar;
		}
	}

	return $outStr;
}

/**
 * Converts all multibyte chars of a UTF-8 string to hex numeric entities ("&#x...;").
 * ASCII bytes pass through; a stray continuation byte becomes the "no char" byte.
 *
 * @param string $str UTF-8 string
 * @return string String with numeric entities
 */
function utf8_to_entities($str) {
	$strLen = strlen($str);
	$outStr = '';

	for ($a = 0; $a < $strLen; $a++) { // Traverse each byte in UTF-8 string.
		$chr = substr($str, $a, 1);
		$ord = ord($chr);

		if ($ord <= 127) { // ASCII 0-127 and one byte. Transparent
			$outStr .= $chr;
			continue;
		}
		if (!($ord & 64)) { // Not a lead byte: MIDDLE of MB sequence
			$outStr .= chr($this->noCharByteVal);
			continue;
		}

		$buf = $chr; // Add first byte
		for ($b = 0; $b < 8; $b++) { // for each byte in multibyte string...
			$ord = $ord << 1; // Shift it left and ...
			if (!($ord & 128)) { // ... test the 8th bit - if it is not set, the sequence is complete.
				break;
			}
			$a++; // Increase pointer...
			$buf .= substr($str, $a, 1); // ... and add the next byte.
		}

		$outStr .= '&#' . $this->utf8CharToUnumber($buf, 1) . ';';
	}

	return $outStr;
}

/**
 * Converts numeric entities (decimal "&#233;" and hex "&#xE9;") to UTF-8 chars,
 * and optionally the named HTML entities of the ISO-8859-1 translation table.
 * Anything that looks like an entity but cannot be converted is left as it is.
 *
 * @param string $str Input string (UTF-8)
 * @param boolean $alsoStdHtmlEnt If set, named entities like "&eacute;" and "&amp;" are converted as well
 * @return string Output string
 */
function entities_to_utf8($str, $alsoStdHtmlEnt = 0) {
	$trans_tbl = array();
	if ($alsoStdHtmlEnt) {
		// Flags and charset are given explicitly because the PHP defaults changed over time
		// (the default charset is UTF-8 since PHP 5.4). The ISO-8859-1 bytes are mapped to UTF-8 below.
		// NOTE: the third argument requires PHP >= 5.3.4.
		$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'ISO-8859-1'));
	}

	// Odd indexes hold the captured entity names (without "&" and ";"), even indexes the text in between.
	$parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
	foreach ($parts as $k => $v) {
		if (!($k % 2)) {
			continue;
		}

		$entity = '&' . $v . ';';
		$match = array();
		if (preg_match('/^#x([0-9a-f]+)$/i', $v, $match)) { // Hex entity
			$parts[$k] = $this->UnumberToChar(hexdec($match[1]));
		} elseif (preg_match('/^#([0-9]+)$/', $v, $match)) { // Dec entity
			$parts[$k] = $this->UnumberToChar((int) $match[1]);
		} elseif (substr($v, 0, 1) != '#' && isset($trans_tbl[$entity])) { // Other entities:
			// In ISO-8859-1 the byte value equals the Unicode number.
			$latin1 = $trans_tbl[$entity];
			$utf8 = '';
			$latin1Len = strlen($latin1);
			for ($i = 0; $i < $latin1Len; $i++) {
				$utf8 .= $this->UnumberToChar(ord($latin1[$i]));
			}
			$parts[$k] = $utf8;
		} else { // No conversion:
			$parts[$k] = $entity;
		}
	}

	return implode('', $parts);
}

/**
 * Converts a UTF-8 string to an array with one entry per character,
 * holding either the Unicode number or the UTF-8 char itself.
 *
 * @param string $str Input string, UTF-8
 * @param boolean $convEntities If set, entities (numeric and standard HTML) are converted to chars first
 * @param boolean $retChar If set, the UTF-8 chars are returned instead of their Unicode numbers
 * @return array Array of Unicode numbers (or chars); a stray continuation byte yields the "no char" value
 */
function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) {
	// If entities must be registered as well...:
	if ($convEntities) {
		$str = $this->entities_to_utf8($str, 1);
	}

	// Do conversion:
	$strLen = strlen($str);
	$outArr = array();

	for ($a = 0; $a < $strLen; $a++) { // Traverse each byte in UTF-8 string.
		$chr = substr($str, $a, 1);
		$ord = ord($chr);

		if ($ord <= 127) { // ASCII 0-127 and one byte. Transparent
			$outArr[] = $retChar ? chr($ord) : $ord;
			continue;
		}
		if (!($ord & 64)) { // Not a lead byte: MIDDLE of MB sequence
			$outArr[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
			continue;
		}

		$buf = $chr; // Add first byte
		for ($b = 0; $b < 8; $b++) { // for each byte in multibyte string...
			$ord = $ord << 1; // Shift it left and ...
			if (!($ord & 128)) { // ... test the 8th bit - if it is not set, the sequence is complete.
				break;
			}
			$a++; // Increase pointer...
			$buf .= substr($str, $a, 1); // ... and add the next byte.
		}

		$outArr[] = $retChar ? $buf : $this->utf8CharToUnumber($buf);
	}

	return $outArr;
}

/**
 * Converts a Unicode number to its UTF-8 byte sequence (1 to 6 bytes).
 * Numbers of 0x80000000 and above cannot be expressed and yield the "no char" byte.
 *
 * @param integer $cbyte Unicode number
 * @return string UTF-8 byte sequence
 */
function UnumberToChar($cbyte) {
	$str = '';

	if ($cbyte < 0x80) {
		$str .= chr($cbyte);
	} elseif ($cbyte < 0x800) {
		$str .= chr(0xC0 | ($cbyte >> 6));
		$str .= chr(0x80 | ($cbyte & 0x3F));
	} elseif ($cbyte < 0x10000) {
		$str .= chr(0xE0 | ($cbyte >> 12));
		$str .= chr(0x80 | (($cbyte >> 6) & 0x3F));
		$str .= chr(0x80 | ($cbyte & 0x3F));
	} elseif ($cbyte < 0x200000) {
		$str .= chr(0xF0 | ($cbyte >> 18));
		$str .= chr(0x80 | (($cbyte >> 12) & 0x3F));
		$str .= chr(0x80 | (($cbyte >> 6) & 0x3F));
		$str .= chr(0x80 | ($cbyte & 0x3F));
	} elseif ($cbyte < 0x4000000) {
		$str .= chr(0xF8 | ($cbyte >> 24));
		$str .= chr(0x80 | (($cbyte >> 18) & 0x3F));
		$str .= chr(0x80 | (($cbyte >> 12) & 0x3F));
		$str .= chr(0x80 | (($cbyte >> 6) & 0x3F));
		$str .= chr(0x80 | ($cbyte & 0x3F));
	} elseif ($cbyte < 0x80000000) {
		$str .= chr(0xFC | ($cbyte >> 30));
		$str .= chr(0x80 | (($cbyte >> 24) & 0x3F));
		$str .= chr(0x80 | (($cbyte >> 18) & 0x3F));
		$str .= chr(0x80 | (($cbyte >> 12) & 0x3F));
		$str .= chr(0x80 | (($cbyte >> 6) & 0x3F));
		$str .= chr(0x80 | ($cbyte & 0x3F));
	} else { // Cannot express a 32-bit character in UTF-8
		$str .= chr($this->noCharByteVal);
	}

	return $str;
}
00979 00989 function utf8CharToUnumber($str, $hex = 0) { 00990 $ord = ord(substr($str, 0, 1)); // First char 00991 00992 if (($ord & 192) == 192) { // This verifyes that it IS a multi byte string 00993 $binBuf = ''; 00994 for ($b = 0; $b < 8; $b++) { // for each byte in multibyte string... 00995 $ord = $ord << 1; // Shift it left and ... 00996 if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence. 00997 $binBuf .= substr('00000000' . decbin(ord(substr($str, $b + 1, 1))), -6); 00998 } else { 00999 break; 01000 } 01001 } 01002 $binBuf = substr('00000000' . decbin(ord(substr($str, 0, 1))), -(6 - $b)) . $binBuf; 01003 01004 $int = bindec($binBuf); 01005 } else { 01006 $int = $ord; 01007 } 01008 01009 return $hex ? 'x' . dechex($int) : $int; 01010 } 01011 01012 01013 /******************************************** 01014 * 01015 * Init functions 01016 * 01017 ********************************************/ 01018 01029 function initCharset($charset) { 01030 // Only process if the charset is not yet loaded: 01031 if (empty($this->parsedCharsets[$charset]) || !is_array($this->parsedCharsets[$charset])) { 01032 01033 // Conversion table filename: 01034 $charsetConvTableFile = PATH_t3lib . 'csconvtbl/' . $charset . '.tbl'; 01035 01036 // If the conversion table is found: 01037 if ($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile)) { 01038 // Cache file for charsets: 01039 // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero. 01040 $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_' . $charset . 
'.tbl'); 01041 if ($cacheFile && @is_file($cacheFile)) { 01042 $this->parsedCharsets[$charset] = unserialize(t3lib_div::getUrl($cacheFile)); 01043 } else { 01044 // Parse conversion table into lines: 01045 $lines = t3lib_div::trimExplode(LF, t3lib_div::getUrl($charsetConvTableFile), 1); 01046 // Initialize the internal variable holding the conv. table: 01047 $this->parsedCharsets[$charset] = array('local' => array(), 'utf8' => array()); 01048 // traverse the lines: 01049 $detectedType = ''; 01050 foreach ($lines as $value) { 01051 if (trim($value) && substr($value, 0, 1) != '#') { // Comment line or blanks are ignored. 01052 01053 // Detect type if not done yet: (Done on first real line) 01054 // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE" 01055 if (!$detectedType) { 01056 $detectedType = preg_match('/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/', $value) ? 'whitespaced' : 'ms-token'; 01057 } 01058 01059 if ($detectedType == 'ms-token') { 01060 list($hexbyte, $utf8) = preg_split('/[=:]/', $value, 3); 01061 } elseif ($detectedType == 'whitespaced') { 01062 $regA = array(); 01063 preg_match('/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/', $value, $regA); 01064 $hexbyte = $regA[1]; 01065 $utf8 = 'U+' . 
$regA[2];
                            }
                            $decval = hexdec(trim($hexbyte));
                            if ($decval > 127) {
                                $utf8decval = hexdec(substr(trim($utf8), 2));
                                $this->parsedCharsets[$charset]['local'][$decval] = $this->UnumberToChar($utf8decval);
                                $this->parsedCharsets[$charset]['utf8'][$this->parsedCharsets[$charset]['local'][$decval]] = $decval;
                            }
                        }
                    }
                    if ($cacheFile) {
                        t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
                    }
                }
                return 2;
            } else {
                return FALSE;
            }
        } else {
            return 1;
        }
    }

    /**
     * Loads the UTF-8 case folding table and the UTF-8 to ASCII transliteration table.
     *
     * With $mode 'case' or 'ascii' the requested table is taken from memory or from
     * its cache file in typo3temp/cs/ when available. Otherwise BOTH tables are
     * rebuilt from unidata/UnicodeData.txt (plus the optional SpecialCasing.txt and
     * Translit.txt), stored in $this->caseFolding['utf-8'] and
     * $this->toASCII['utf-8'], and written to their cache files.
     *
     * @param string $mode 'case' (case folding) or 'ascii' (transliteration); any other value forces a rebuild
     * @return integer|boolean FALSE if UnicodeData.txt cannot be read, 1 if the table was already loaded, 2 if it came from the cache file, 3 if the tables were rebuilt
     */
    public function initUnicodeData($mode = NULL) {
        // cache files
        $cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
        $cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

        // Only process if the tables are not yet loaded
        switch ($mode) {
            case 'case':
                if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) {
                    return 1;
                }

                // Use cached version if possible
                if ($cacheFileCase && @is_file($cacheFileCase)) {
                    $this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
                    return 2;
                }
                break;

            case 'ascii':
                if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) {
                    return 1;
                }

                // Use cached version if possible
                if ($cacheFileASCII && @is_file($cacheFileASCII)) {
                    $this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
                    return 2;
                }
                break;
        }

        // process main Unicode data file
        $unicodeDataFile = PATH_t3lib . 'unidata/UnicodeData.txt';
        if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
            return FALSE;
        }

        $fh = fopen($unicodeDataFile, 'rb');
        if (!$fh) {
            return FALSE;
        }

        // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
        // note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
        $this->caseFolding['utf-8'] = array();
        $utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
        $utf8CaseFolding['toUpper'] = array();
        $utf8CaseFolding['toLower'] = array();
        $utf8CaseFolding['toTitle'] = array();

        $decomposition = array(); // array of temp. decompositions
        $mark = array(); // array of chars that are marks (eg. composing accents)
        $number = array(); // array of chars that are numbers (eg. digits)
        $omit = array(); // array of chars to be omitted (eg. Russian hard sign)

        while (!feof($fh)) {
            $line = fgets($fh, 4096);
            if ($line === FALSE) {
                // EOF reached by this read (or read error): there is no record to parse
                break;
            }
            $line = rtrim($line);
            if ($line === '') {
                continue;
            }

            // The list() below reads fields up to index 14; pad so that short lines cannot raise notices
            list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title) = array_pad(explode(';', $line), 15, '');

            $ord = hexdec($char);
            if ($ord > 0xFFFF) {
                break; // only process the BMP
            }

            $utf8_char = $this->UnumberToChar($ord);

            if ($upper) {
                $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
            }
            if ($lower) {
                $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
            }
            // store "title" only when different from "upper" (only a few)
            if ($title && $title != $upper) {
                $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
            }

            // First letter of the category field selects the major class
            $majorCategory = ($cat !== '') ? $cat[0] : '';
            switch ($majorCategory) {
                case 'M': // mark (accent, umlaut, ...)
                    $mark["U+$char"] = 1;
                    break;

                case 'N': // numeric value
                    if ($ord > 0x80 && $num != '') {
                        $number["U+$char"] = $num;
                    }
                    break;
            }

            // accented Latin letters without "official" decomposition
            $match = array();
            if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
                $c = ord($match[2]);
                if ($match[1] == 'SMALL') {
                    $c += 32;
                }

                $decomposition["U+$char"] = array(dechex($c));
                continue;
            }

            $match = array();
            if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
                switch ($match[1]) {
                    case '<circle>': // add parenthesis as circle replacement, eg (1)
                        $match[2] = '0028 ' . $match[2] . ' 0029';
                        break;

                    case '<square>': // add square brackets as square replacement, eg [1]
                        $match[2] = '005B ' . $match[2] . ' 005D';
                        break;

                    case '<compat>': // ignore multi char decompositions that start with a space
                        if (preg_match('/^0020 /', $match[2])) {
                            continue 2;
                        }
                        break;

                    // ignore Arabic and vertical layout presentation decomposition
                    case '<initial>':
                    case '<medial>':
                    case '<final>':
                    case '<isolated>':
                    case '<vertical>':
                        continue 2;
                }
                $decomposition["U+$char"] = explode(' ', $match[2]);
            }
        }
        fclose($fh);

        // process additional Unicode data for casing (allow folded characters to expand into a sequence)
        $specialCasingFile = PATH_t3lib . 'unidata/SpecialCasing.txt';
        if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
            $fh = fopen($specialCasingFile, 'rb');
            if ($fh) {
                while (!feof($fh)) {
                    $line = fgets($fh, 4096);
                    if ($line === FALSE) {
                        break;
                    }
                    if ($line === '' || $line[0] === '#' || trim($line) == '') {
                        continue;
                    }

                    list($char, $lower, $title, $upper, $cond) = array_pad(t3lib_div::trimExplode(';', $line), 5, '');
                    // Only unconditional mappings are used (condition field empty or already the trailing comment)
                    if ($cond == '' || $cond[0] == '#') {
                        $utf8_char = $this->UnumberToChar(hexdec($char));

                        // Collect the mappings that differ from the character itself
                        $sequences = array();
                        if ($char != $lower) {
                            $sequences['toLower'] = $lower;
                        }
                        if ($char != $title && $title != $upper) {
                            $sequences['toTitle'] = $title;
                        }
                        if ($char != $upper) {
                            $sequences['toUpper'] = $upper;
                        }

                        // Each mapping is a space-separated list of hex code points
                        foreach ($sequences as $direction => $hexSequence) {
                            $folded = '';
                            foreach (explode(' ', $hexSequence) as $hexCodePoint) {
                                $folded .= $this->UnumberToChar(hexdec($hexCodePoint));
                            }
                            $utf8CaseFolding[$direction][$utf8_char] = $folded;
                        }
                    }
                }
                fclose($fh);
            }
        }

        // process custom decompositions
        $customTranslitFile = PATH_t3lib . 'unidata/Translit.txt';
        if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
            $fh = fopen($customTranslitFile, 'rb');
            if ($fh) {
                while (!feof($fh)) {
                    $line = fgets($fh, 4096);
                    if ($line === FALSE) {
                        break;
                    }
                    if ($line === '' || $line[0] === '#' || trim($line) == '') {
                        continue;
                    }

                    list($char, $translit) = array_pad(t3lib_div::trimExplode(';', $line), 2, '');
                    if (!$translit) {
                        // empty transliteration: the character is dropped from ASCII output
                        $omit["U+$char"] = 1;
                    }
                    $decomposition["U+$char"] = explode(' ', $translit);
                }
                fclose($fh);
            }
        }

        // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
        foreach ($decomposition as $from => $to) {
            $code_decomp = array();

            while ($code_value = array_shift($to)) {
                if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
                    foreach (array_reverse($decomposition["U+$code_value"]) as $cv) {
                        array_unshift($to, $cv);
                    }
                } elseif (!isset($mark["U+$code_value"])) { // remove mark
                    array_push($code_decomp, $code_value);
                }
            }
            if (count($code_decomp) || isset($omit[$from])) {
                $decomposition[$from] = $code_decomp;
            } else {
                unset($decomposition[$from]);
            }
        }

        // create ascii only mapping
        $this->toASCII['utf-8'] = array();
        $ascii =& $this->toASCII['utf-8'];

        foreach ($decomposition as $from => $to) {
            $code_decomp = array();
            while ($code_value = array_shift($to)) {
                $ord = hexdec($code_value);
                if ($ord > 127) {
                    continue 2; // skip decompositions containing non-ASCII chars
                }
                array_push($code_decomp, chr($ord));
            }
            $ascii[$this->UnumberToChar(hexdec($from))] = implode('', $code_decomp);
        }

        // add numeric decompositions
        foreach ($number as $from => $to) {
            $utf8_char = $this->UnumberToChar(hexdec($from));
            if (!isset($ascii[$utf8_char])) {
                $ascii[$utf8_char] = $to;
            }
        }

        if ($cacheFileCase) {
            t3lib_div::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
        }

        if ($cacheFileASCII) {
            t3lib_div::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
        }

        return 3;
    }

    /**
     * Loads the case folding table for a charset into $this->caseFolding[$charset].
     *
     * The table is taken from memory or from typo3temp/cs/cscase_<charset>.tbl when
     * available. Otherwise it is derived from the UTF-8 case folding table by
     * converting every character of the charset's conversion table, and the result
     * is written to the cache file.
     *
     * @param string $charset Charset name as used for the conversion tables
     * @return integer|boolean FALSE if the charset or the Unicode data cannot be loaded, 1 if already loaded, 2 if loaded from cache, 3 if built
     */
    public function initCaseFolding($charset) {
        // Only process if the case table is not yet loaded:
        if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) {
            return 1;
        }

        // Use cached version if possible
        $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
        if ($cacheFile && @is_file($cacheFile)) {
            $this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
            return 2;
        }

        // init UTF-8 conversion for this charset
        if (!$this->initCharset($charset)) {
            return FALSE;
        }

        // UTF-8 case folding is used as the base conversion table
        if (!$this->initUnicodeData('case')) {
            return FALSE;
        }

        $nochar = chr($this->noCharByteVal);
        $directions = array('toUpper', 'toLower', 'toTitle');
        foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
            // reconvert to charset (don't use chr() of numeric value, might be multi-byte)
            $c = $this->utf8_decode($utf8, $charset);

            foreach ($directions as $direction) {
                // characters without a mapping in this direction are left out of the table
                if (!isset($this->caseFolding['utf-8'][$direction][$utf8])) {
                    continue;
                }
                $cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
                // skip mappings that cannot be represented in the target charset
                if ($cc != '' && $cc != $nochar) {
                    $this->caseFolding[$charset][$direction][$c] = $cc;
                }
            }
        }

        // add the ASCII case table
        for ($i = ord('a'); $i <= ord('z'); $i++) {
            $this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i - 32);
        }
        for ($i = ord('A'); $i <= ord('Z'); $i++) {
            $this->caseFolding[$charset]['toLower'][chr($i)] = chr($i + 32);
        }

        if ($cacheFile) {
            t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
        }

        return 3;
    }

    /**
     * Loads the charset-to-ASCII transliteration table into $this->toASCII[$charset].
     *
     * The table is taken from memory or from typo3temp/cs/csascii_<charset>.tbl when
     * available. Otherwise it is derived from the UTF-8 transliteration table for
     * every character of the charset's conversion table and written to the cache file.
     *
     * @param string $charset Charset name as used for the conversion tables
     * @return integer|boolean FALSE if the charset or the Unicode data cannot be loaded, 1 if already loaded, 2 if loaded from cache, 3 if built
     */
    public function initToASCII($charset) {
        // Only process if the transliteration table is not yet loaded:
        if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
            return 1;
        }

        // Use cached version if possible
        $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
        if ($cacheFile && @is_file($cacheFile)) {
            $this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
            return 2;
        }

        // init UTF-8 conversion for this charset
        if (!$this->initCharset($charset)) {
            return FALSE;
        }

        // UTF-8/ASCII transliteration is used as the base conversion table
        if (!$this->initUnicodeData('ascii')) {
            return FALSE;
        }

        // Start from an empty table so that a charset without any transliterable
        // character still counts as "loaded" and is cached as an array
        $this->toASCII[$charset] = array();
        foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
            // reconvert to charset (don't use chr() of numeric value, might be multi-byte)
            $c = $this->utf8_decode($utf8, $charset);

            if (isset($this->toASCII['utf-8'][$utf8])) {
                $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
            }
        }

        if ($cacheFile) {
            t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
        }

        return 3;
    }


/********************************************
     *
     * String operation functions
     *
     ********************************************/

    /**
     * Returns a part of a string, counted in characters of the given charset.
     *
     * Uses mbstring or iconv when configured in
     * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'], otherwise the built-in
     * UTF-8, EUC, fixed-width or single-byte implementations.
     *
     * @param string $charset The character set
     * @param string $string Input string
     * @param integer $start Start position (character position)
     * @param integer $len Length (in characters); NULL means "to the end of the string"
     * @return string|boolean The substring; the underlying implementation may return FALSE when $start is out of range
     */
    public function substr($charset, $string, $start, $len = NULL) {
        if ($len === 0 || $string === '') {
            return '';
        }

        if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
            if ($len == NULL) {
                // The byte length is an upper bound for the character count, so this
                // reads "to the end" without touching the global internal encoding
                return mb_substr($string, $start, strlen($string), $charset);
            }
            return mb_substr($string, $start, $len, $charset);
        } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
            if ($len == NULL) {
                // Same upper-bound trick; avoids the deprecated iconv_set_encoding()
                return iconv_substr($string, $start, strlen($string), $charset);
            }
            return iconv_substr($string, $start, $len, $charset);
        } elseif ($charset == 'utf-8') {
            return $this->utf8_substr($string, $start, $len);
        } elseif (!empty($this->eucBasedSets[$charset])) {
            return $this->euc_substr($string, $start, $charset, $len);
        } elseif (!empty($this->twoByteSets[$charset])) {
            // NULL * 2 would be 0 and yield an empty result, so handle "no length" explicitly
            return $len === NULL ? substr($string, $start * 2) : substr($string, $start * 2, $len * 2);
        } elseif (!empty($this->fourByteSets[$charset])) {
            return $len === NULL ? substr($string, $start * 4) : substr($string, $start * 4, $len * 4);
        }

        // treat everything else as single-byte encoding
        return $len === NULL ? substr($string, $start) : substr($string, $start, $len);
    }

    /**
     * Counts the number of characters of a string in the given charset.
     *
     * @param string $charset The character set
     * @param string $string Input string
     * @return integer|float Number of characters (byte length divided by 2 or 4 for fixed-width charsets)
     */
    public function strlen($charset, $string) {
        if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
            return mb_strlen($string, $charset);
        } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
            return iconv_strlen($string, $charset);
        } elseif ($charset == 'utf-8') {
            return $this->utf8_strlen($string);
        } elseif (!empty($this->eucBasedSets[$charset])) {
            return $this->euc_strlen($string, $charset);
        } elseif (!empty($this->twoByteSets[$charset])) {
            return strlen($string) / 2;
        } elseif (!empty($this->fourByteSets[$charset])) {
            return strlen($string) / 4;
        }
        // treat everything else as single-byte encoding
        return strlen($string);
    }

    /**
     * mbstring implementation of crop().
     *
     * @param string $charset The character set
     * @param string $string String to be cropped
     * @param integer $len Crop length in characters; negative values keep the end of the string
     * @param string $crop Marker appended (positive $len) or prepended (negative $len) when the string was cropped
     * @return string The cropped string, or the unchanged input if it is not longer than abs($len)
     */
    protected function cropMbstring($charset, $string, $len, $crop = '') {
        if (intval($len) === 0 || mb_strlen($string, $charset) <= abs($len)) {
            return $string;
        }

        if ($len > 0) {
            $string = mb_substr($string, 0, $len, $charset) . $crop;
        } else {
            $string = $crop . mb_substr($string, $len, mb_strlen($string, $charset), $charset);
        }

        return $string;
    }

    /**
     * Truncates a string to a number of characters and adds a crop marker.
     *
     * @param string $charset The character set
     * @param string $string String to be cropped
     * @param integer $len Crop length in characters; negative values keep the end of the string
     * @param string $crop Marker appended (positive $len) or prepended (negative $len) when the string was cropped
     * @return string The cropped string, or the unchanged input if no cropping was necessary
     */
    public function crop($charset, $string, $len, $crop = '') {
        if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
            return $this->cropMbstring($charset, $string, $len, $crop);
        }

        if (intval($len) == 0) {
            return $string;
        }

        // Translate the character position into a byte position
        if ($charset == 'utf-8') {
            $i = $this->utf8_char2byte_pos($string, $len);
        } elseif (!empty($this->eucBasedSets[$charset])) {
            $i = $this->euc_char2byte_pos($string, $len, $charset);
        } else {
            if ($len > 0) {
                $i = $len;
            } else {
                $i = strlen($string) + $len;
                if ($i <= 0) {
                    $i = FALSE;
                }
            }
        }

        if ($i === FALSE) { // $len outside actual string length
            return $string;
        }

        // Crop only if a byte actually exists at the cut position. Explicit bounds
        // are used because negative string offsets index from the end since PHP 7.1
        $byteLength = strlen($string);
        if ($len > 0) {
            if ($i >= 0 && $i < $byteLength) {
                return substr($string, 0, $i) . $crop;
            }
        } elseif ($i >= 1 && $i <= $byteLength) {
            return $crop . substr($string, $i);
        }

        return $string;
    }

    /**
     * Cuts a string short at a given byte length without splitting a multi-byte character.
     *
     * @param string $charset The character set
     * @param string $string Input string
     * @param integer $len Maximum length in bytes
     * @return string The shortened string ('' if $len is not positive)
     */
    public function strtrunc($charset, $string, $len) {
        if ($len <= 0) {
            return '';
        }

        if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
            return mb_strcut($string, 0, $len, $charset);
        } elseif ($charset == 'utf-8') {
            return $this->utf8_strtrunc($string, $len);
        } elseif (!empty($this->eucBasedSets[$charset])) {
            return $this->euc_strtrunc($string, $len, $charset);
        } elseif (!empty($this->twoByteSets[$charset])) {
            if ($len % 2) {
                $len--; // don't cut at odd positions
            }
        } elseif (!empty($this->fourByteSets[$charset])) {
            $len -= $len % 4; // realign to position dividable by four
        }
        // treat everything else as single-byte encoding
        return substr($string, 0, $len);
    }

    /**
     * Translates all characters of a string to their lower or upper case equivalents.
     *
     * With mbstring enabled, every $case value other than 'toLower' converts to upper case.
     *
     * @param string $charset The character set
     * @param string $string Input string
     * @param string $case Key of the case folding table to use: 'toLower' or 'toUpper'
     * @return string The converted string
     */
    public function conv_case($charset, $string, $case) {
        if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
            if ($case == 'toLower') {
                $string = mb_strtolower($string, $charset);
            } else {
                $string = mb_strtoupper($string, $charset);
            }
        } elseif ($charset == 'utf-8') {
            $string = $this->utf8_char_mapping($string, 'case', $case);
        } elseif (isset($this->eucBasedSets[$charset])) {
            $string = $this->euc_char_mapping($string, $charset, 'case', $case);
        } else {
            // treat everything else as single-byte encoding
            $string = $this->sb_char_mapping($string, $charset, 'case', $case);
        }

        return $string;
    }

    /**
     * Replaces special characters of a string by their ASCII transliteration.
     *
     * @param string $charset The character set
     * @param string $string Input string
     * @return string The converted string
     */
    public function specCharsToASCII($charset, $string) {
        if ($charset == 'utf-8') {
            $string = $this->utf8_char_mapping($string, 'ascii');
        } elseif (isset($this->eucBasedSets[$charset])) {
            $string = $this->euc_char_mapping($string, $charset, 'ascii');
        } else {
            // treat everything else as single-byte encoding
            $string = $this->sb_char_mapping($string, $charset, 'ascii');
        }

        return $string;
    }


    /**
     * Picks the TYPO3 language key that best matches a list of preferred language codes.
     *
     * @param string $languageCodesList Comma-separated language codes with optional ";q=" weights (the format of an HTTP Accept-Language header)
     * @return string A TYPO3 language key, or 'default' if nothing matches or the match is 'en'
     */
    public function getPreferredClientLanguage($languageCodesList) {
        $allLanguageCodes = array();
        $selectedLanguage = 'default';

        // get all languages where TYPO3 code is the same as the ISO code
        foreach ($this->charSetArray as $typo3Lang => $charSet) {
            $allLanguageCodes[$typo3Lang] = $typo3Lang;
        }

        // get all languages where TYPO3 code differs from ISO code
        // or needs the country part
        // the iso codes will here overwrite the default typo3 language in the key
        foreach ($this->isoArray as $typo3Lang => $isoLang) {
            $isoLang = implode('-', explode('_', $isoLang));
            $allLanguageCodes[$typo3Lang] = $isoLang;
        }

        // move the iso codes to the keys (because we're comparing the keys with "isset" later on)
        $allLanguageCodes = array_flip($allLanguageCodes);

        $preferredLanguages = t3lib_div::trimExplode(',', $languageCodesList);
        // collect the preferred languages together with their quality value
        $sortedPreferredLanguages = array();
        foreach ($preferredLanguages as $preferredLanguage) {
            $quality = 1.0;
            if (strpos($preferredLanguage, ';q=') !== FALSE) {
                list($preferredLanguage, $quality) = explode(';q=', $preferredLanguage);
            }
            $sortedPreferredLanguages[$preferredLanguage] = $quality;
        }

        // loop through the languages, with the highest priority first
        arsort($sortedPreferredLanguages, SORT_NUMERIC);
        foreach ($sortedPreferredLanguages as $preferredLanguage => $quality) {
            if (isset($allLanguageCodes[$preferredLanguage])) {
                $selectedLanguage = $allLanguageCodes[$preferredLanguage];
                break;
            }

            // strip the country code from the end; only the language part is needed,
            // and codes without a country part must not raise an undefined-offset notice
            list($preferredLanguage) = explode('-', $preferredLanguage);
            if (isset($allLanguageCodes[$preferredLanguage])) {
                $selectedLanguage = $allLanguageCodes[$preferredLanguage];
                break;
            }
        }
        if (!$selectedLanguage || $selectedLanguage == 'en') {
            $selectedLanguage = 'default';
        }
        return $selectedLanguage;
    }


    /********************************************
     *
     * Internal string operation functions
     *
     ********************************************/

    /**
     * Maps every byte of a single-byte encoded string through a case folding or ASCII table.
     *
     * @param string $str Input string
     * @param string $charset The character set
     * @param string $mode 'case' (case folding) or 'ascii' (transliteration)
     * @param string $opt For mode 'case': key of the case table ('toLower', 'toUpper', 'toTitle')
     * @return string The converted string; the unchanged input if the table cannot be loaded or $mode is unknown
     */
    public function sb_char_mapping($str, $charset, $mode, $opt = '') {
        switch ($mode) {
            case 'case':
                if (!$this->initCaseFolding($charset)) {
                    return $str; // do nothing
                }
                $map =& $this->caseFolding[$charset][$opt];
                break;

            case 'ascii':
                if (!$this->initToASCII($charset)) {
                    return $str; // do nothing
                }
                $map =& $this->toASCII[$charset];
                break;

            default:
                return $str;
        }

        $out = '';
        $length = strlen($str);
        for ($i = 0; $i < $length; $i++) {
            $c = $str[$i];
            $out .= isset($map[$c]) ? $map[$c] : $c;
        }

        return $out;
    }


    /********************************************
     *
     * Internal UTF-8 string operation functions
     *
     ********************************************/

    /**
     * Returns a part of a UTF-8 encoded string, counted in characters.
     *
     * @param string $str UTF-8 string
     * @param integer $start Start position (character position); negative counts from the end
     * @param integer $len Length (in characters); NULL means "to the end of the string"
     * @return string|boolean The substring, or FALSE if a positive $start lies outside the string
     */
    public function utf8_substr($str, $start, $len = NULL) {
        if ((string) $len === '0') {
            return '';
        }

        $byte_start = $this->utf8_char2byte_pos($str, $start);
        if ($byte_start === FALSE) {
            if ($start > 0) {
                return FALSE; // $start outside string length
            }
            // a negative $start reaching before the string starts at the first byte
            $byte_start = 0;
        }

        $str = substr($str, $byte_start);

        if ($len != NULL) {
            $byte_end = $this->utf8_char2byte_pos($str, $len);
            if ($byte_end === FALSE) { // $len outside actual string length
                // When length is less than zero and exceeds, then we return blank string.
                return $len < 0 ? '' : $str;
            }
            return substr($str, 0, $byte_end);
        }

        return $str;
    }

    /**
     * Counts the characters of a UTF-8 encoded string.
     *
     * Every single-byte character and every multi-byte lead byte counts as one
     * character; continuation bytes are not counted.
     *
     * @param string $str UTF-8 string
     * @return integer Number of characters
     */
    public function utf8_strlen($str) {
        $n = 0;
        $length = strlen($str);
        for ($i = 0; $i < $length; $i++) {
            $c = ord($str[$i]);
            // single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
            if (!($c & 0x80) || ($c & 0xC0) == 0xC0) {
                $n++;
            }
        }
        return $n;
    }

    /**
     * Truncates a UTF-8 string at a byte length without splitting a multi-byte character.
     *
     * @param string $str UTF-8 string
     * @param integer $len Maximum length in bytes
     * @return string The shortened string
     */
    public function utf8_strtrunc($str, $len) {
        $i = $len - 1;
        // No byte at the cut position (string shorter than $len, or $len < 1):
        // nothing can be split, plain byte-wise substr() applies
        if ($i < 0 || $i >= strlen($str)) {
            return substr($str, 0, $len);
        }

        if (ord($str[$i]) & 0x80) { // part of a multibyte sequence
            for (; $i > 0 && !(ord($str[$i]) & 0x40); $i--) ; // find the first byte
            $mbs = ord($str[$i]);
            // sanity check: reached the start of the string without finding a lead byte.
            // A lead byte AT offset 0 is valid and must still be measured below.
            if ($i <= 0 && ($mbs & 0xC0) != 0xC0) {
                return '';
            }
            for ($bc = 0; $mbs & 0x80; $mbs = $mbs << 1) {
                $bc++; // calculate number of bytes
            }
            if ($bc + $i > $len) {
                return substr($str, 0, $i);
            }
            // fallthru: multibyte char fits into length
        }
        return substr($str, 0, $len);
    }

    /**
     * Finds the character position of the first occurrence of a string in a UTF-8 string.
     *
     * @param string $haystack UTF-8 string to search in
     * @param string $needle UTF-8 string to search for
     * @param integer $offset Character position to start the search at
     * @return integer|boolean Character position, or FALSE if not found or $offset is beyond the string
     */
    public function utf8_strpos($haystack, $needle, $offset = 0) {
        if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
            return mb_strpos($haystack, $needle, $offset, 'utf-8');
        } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
            return iconv_strpos($haystack, $needle, $offset, 'utf-8');
        }

        $byte_offset = $this->utf8_char2byte_pos($haystack, $offset);
        if ($byte_offset === FALSE) {
            return FALSE; // offset beyond string length
        }

        $byte_pos = strpos($haystack, $needle, $byte_offset);
        if ($byte_pos === FALSE) {
            return FALSE; // needle not found
        }

        return $this->utf8_byte2char_pos($haystack, $byte_pos);
    }

    /**
     * Finds the character position of the last occurrence of a string in a UTF-8 string.
     *
     * @param string $haystack UTF-8 string to search in
     * @param string $needle UTF-8 string to search for
     * @return integer|boolean Character position, or FALSE if not found
     */
    public function utf8_strrpos($haystack, $needle) {
        if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
            // the third argument is the offset; the encoding is the fourth
            return mb_strrpos($haystack, $needle, 0, 'utf-8');
        } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
            return iconv_strrpos($haystack, $needle, 'utf-8');
        }

        $byte_pos = strrpos($haystack, $needle);
        if ($byte_pos === FALSE) {
            return FALSE; // needle not found
        }

        return $this->utf8_byte2char_pos($haystack, $byte_pos);
    }

    /**
     * Translates a character position into a byte position in a UTF-8 string.
     *
     * @param string $str UTF-8 string
     * @param integer $pos Character position; negative values count from the end of the string
     * @return integer|boolean Byte position, or FALSE if the scan runs off either end of the string
     */
    public function utf8_char2byte_pos($str, $pos) {
        $length = strlen($str);
        $n = 0; // number of characters found
        $p = abs($pos); // number of characters wanted

        if ($pos >= 0) {
            $i = 0;
            $d = 1;
        } else {
            $i = $length - 1;
            $d = -1;
        }

        // Explicit bounds: negative string offsets index from the end since PHP 7.1,
        // so probing $str[$i] cannot be used to detect the start of the string
        for (; $i >= 0 && $i < $length && $n < $p; $i += $d) {
            $c = ord($str[$i]);
            // single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
            if (!($c & 0x80) || ($c & 0xC0) == 0xC0) {
                $n++;
            }
        }
        if ($i < 0 || $i >= $length) {
            return FALSE; // offset beyond string length
        }

        if ($pos >= 0) {
            // skip trailing multi-byte data bytes (10xxxxxx)
            while ($i < $length && (ord($str[$i]) & 0xC0) == 0x80) {
                $i++;
            }
        } else {
            // correct offset
            $i++;
        }

        return $i;
    }

    /**
     * Translates a byte position into a character position in a UTF-8 string.
     *
     * @param string $str UTF-8 string
     * @param integer $pos Byte position
     * @return integer|boolean Character position, or FALSE if the string is empty or $pos lies outside the string
     */
    public function utf8_byte2char_pos($str, $pos) {
        $length = strlen($str);
        if ($length == 0 || $pos < 0 || $pos > $length) {
            return FALSE; // offset beyond string length
        }

        $n = 0; // number of characters
        for ($i = $pos; $i > 0; $i--) {
            // $pos == $length addresses the end of the string; treat the missing byte as
            // a character start so the result equals the total number of characters
            $c = ($i < $length) ? ord($str[$i]) : 0;
            // single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
            if (!($c & 0x80) || ($c & 0xC0) == 0xC0) {
                $n++;
            }
        }

        return $n;
    }

    /**
     * Maps every character of a UTF-8 string through the case folding or ASCII table.
     *
     * @param string $str UTF-8 string
     * @param string $mode 'case' (case folding) or 'ascii' (transliteration)
     * @param string $opt For mode 'case': key of the case table ('toLower', 'toUpper', 'toTitle')
     * @return string The converted string; the unchanged input if the table cannot be loaded or $mode is unknown
     */
    public function utf8_char_mapping($str, $mode, $opt = '') {
        if (!$this->initUnicodeData($mode)) {
            return $str; // do nothing
        }

        $out = '';
        switch ($mode) {
            case 'case':
                $map =& $this->caseFolding['utf-8'][$opt];
                break;

            case 'ascii':
                $map =& $this->toASCII['utf-8'];
                break;

            default:
                return $str;
        }

        $length = strlen($str);
        for ($i = 0; $i < $length; $i++) {
            $c = ord($str[$i]);
            if (($c & 0xC0) == 0xC0) { // multi-byte starting byte (11xxxxxx)
                for ($bc = 0; $c & 0x80; $c = $c << 1) {
                    $bc++; // calculate number of bytes
                }
                $mbc = substr($str, $i, $bc);
                $i += $bc - 1;
            } else {
                // single-byte (0xxxxxxx), or a stray continuation byte of malformed
                // input, which is passed through instead of repeating the previous character
                $mbc = $str[$i];
            }

            $out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
        }

        return $out;
    }


    /********************************************
     *
     * Internal EUC string operation functions
     *
     * Extended Unix Code:
     *  ASCII compatible 7bit single bytes chars
     *  8bit two byte chars
     *
     * Shift-JIS is treated as a special case.
     *
     ********************************************/

    /**
     * Truncates an EUC/Shift-JIS string at a byte length without splitting a double-byte character.
     *
     * @param string $str EUC or Shift-JIS encoded string
     * @param integer $len Maximum length in bytes
     * @param string $charset The character set ('shift_jis' selects the Shift-JIS lead byte ranges)
     * @return string The shortened string
     */
    public function euc_strtrunc($str, $len, $charset) {
        $length = strlen($str);
        if ($length <= $len) {
            return $str; // string not longer than supplied length
        }

        $sjis = ($charset == 'shift_jis');
        for ($i = 0; $i < $length && $i < $len; $i++) {
            $c = ord($str[$i]);
            $isDoubleByte = $sjis ? (($c >= 0x80 && $c < 0xA0) || $c >= 0xE0) : ($c >= 0x80);
            if ($isDoubleByte) {
                $i++; // advance a double-byte char
            }
        }

        if ($i > $len) {
            return substr($str, 0, $len - 1); // we ended on a first byte
        }
        return substr($str, 0, $len);
    }

    /**
     * Returns a part of an EUC/Shift-JIS encoded string, counted in characters.
     *
     * @param string $str EUC or Shift-JIS encoded string
     * @param integer $start Start position (character position)
     * @param string $charset The character set
     * @param integer $len Length (in characters); NULL means "to the end of the string"
     * @return string|boolean The substring, or FALSE if $start lies outside the string
     */
    public function euc_substr($str, $start, $charset, $len = NULL) {
        $byte_start = $this->euc_char2byte_pos($str, $start, $charset);
        if ($byte_start === FALSE) {
            return FALSE; // $start outside string length
        }

        $str = substr($str, $byte_start);

        if ($len != NULL) {
            $byte_end = $this->euc_char2byte_pos($str, $len, $charset);
            if ($byte_end === FALSE) { // $len outside actual string length
                return $str;
            }
            return substr($str, 0, $byte_end);
        }

        return $str;
    }

    /**
     * Counts the characters of an EUC/Shift-JIS encoded string.
     *
     * @param string $str EUC or Shift-JIS encoded string
     * @param string $charset The character set
     * @return integer Number of characters
     */
    public function euc_strlen($str, $charset) {
        $sjis = ($charset == 'shift_jis');
        $n = 0;
        $length = strlen($str);
        for ($i = 0; $i < $length; $i++) {
            $c = ord($str[$i]);
            $isDoubleByte = $sjis ? (($c >= 0x80 && $c < 0xA0) || $c >= 0xE0) : ($c >= 0x80);
            if ($isDoubleByte) {
                $i++; // advance a double-byte char
            }

            $n++;
        }

        return $n;
    }

    /**
     * Translates a character position into a byte position in an EUC/Shift-JIS string.
     *
     * @param string $str EUC or Shift-JIS encoded string
     * @param integer $pos Character position; negative values count from the end of the string
     * @param string $charset The character set
     * @return integer|boolean Byte position, or FALSE if the scan runs off either end of the string
     */
    public function euc_char2byte_pos($str, $pos, $charset) {
        $sjis = ($charset == 'shift_jis');
        $length = strlen($str);
        $n = 0; // number of characters seen
        $p = abs($pos); // number of characters wanted

        if ($pos >= 0) {
            $i = 0;
            $d = 1;
        } else {
            $i = $length - 1;
            $d = -1;
        }

        // Explicit bounds: negative string offsets index from the end since PHP 7.1
        // NOTE(review): the backward scan applies the lead-byte test to the byte it
        // lands on; whether that is reliable for Shift-JIS trail bytes should be confirmed
        for (; $i >= 0 && $i < $length && $n < $p; $i += $d) {
            $c = ord($str[$i]);
            $isDoubleByte = $sjis ? (($c >= 0x80 && $c < 0xA0) || $c >= 0xE0) : ($c >= 0x80);
            if ($isDoubleByte) {
                $i += $d; // advance a double-byte char
            }

            $n++;
        }
        if ($i < 0 || $i >= $length) {
            return FALSE; // offset beyond string length
        }

        if ($pos < 0) {
            $i++; // correct offset
        }

        return $i;
    }

    /**
     * Maps every character of an EUC/Shift-JIS string through a case folding or ASCII table.
     *
     * @param string $str EUC or Shift-JIS encoded string
     * @param string $charset The character set
     * @param string $mode 'case' (case folding) or 'ascii' (transliteration)
     * @param string $opt For mode 'case': key of the case table ('toLower', 'toUpper', 'toTitle')
     * @return string The converted string; the unchanged input if the table cannot be loaded or $mode is unknown
     */
    public function euc_char_mapping($str, $charset, $mode, $opt = '') {
        switch ($mode) {
            case 'case':
                if (!$this->initCaseFolding($charset)) {
                    return $str; // do nothing
                }
                $map =& $this->caseFolding[$charset][$opt];
                break;

            case 'ascii':
                if (!$this->initToASCII($charset)) {
                    return $str; // do nothing
                }
                $map =& $this->toASCII[$charset];
                break;

            default:
                return $str;
        }

        $sjis = ($charset == 'shift_jis');
        $out = '';
        $length = strlen($str);
        for ($i = 0; $i < $length; $i++) {
            $mbc = $str[$i];
            $c = ord($mbc);

            $isDoubleByte = $sjis ? (($c >= 0x80 && $c < 0xA0) || $c >= 0xE0) : ($c >= 0x80);
            if ($isDoubleByte) { // a double-byte char
                $mbc = substr($str, $i, 2);
                $i++;
            }

            $out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
        }

        return $out;
    }

}

if (defined('TYPO3_MODE') && isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])) {
    include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
}

?>