documentation/moodle/class_8t3lib__cs_8php_source.html

00001 <?php
00002 /***************************************************************
00003  *  Copyright notice
00004  *
00005  *  (c) 2003-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
00006  *  All rights reserved
00007  *
00008  *  This script is part of the Typo3 project. The Typo3 project is
00009  *  free software; you can redistribute it and/or modify
00010  *  it under the terms of the GNU General Public License as published by
00011  *  the Free Software Foundation; either version 2 of the License, or
00012  *  (at your option) any later version.
00013  *
00014  *  The GNU General Public License can be found at
00015  *  http://www.gnu.org/copyleft/gpl.html.
00016  *
00017  *  This script is distributed in the hope that it will be useful,
00018  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00019  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00020  *  GNU General Public License for more details.
00021  *
00022  *  This copyright notice MUST APPEAR in all copies of the script!
00023  ***************************************************************/
00129 class t3lib_cs {
00130         var $noCharByteVal = 63; // ASCII Value for chars with no equivalent.
00131
00132                 // This is the array where parsed conversion tables are stored (cached)
00133         var $parsedCharsets = array();
00134
00135                 // An array where case folding data will be stored (cached)
00136         var $caseFolding = array();
00137
00138                 // An array where charset-to-ASCII mappings are stored (cached)
00139         var $toASCII = array();
00140
00141                 // This tells the converter which charsets has two bytes per char:
00142         var $twoByteSets = array(
00143                 'ucs-2' => 1, // 2-byte Unicode
00144         );
00145
00146                 // This tells the converter which charsets has four bytes per char:
00147         var $fourByteSets = array(
00148                 'ucs-4' => 1, // 4-byte Unicode
00149                 'utf-32' => 1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
00150         );
00151
00152                 // This tells the converter which charsets use a scheme like the Extended Unix Code:
00153         var $eucBasedSets = array(
00154                 'gb2312' => 1, // Chinese, simplified.
00155                 'big5' => 1, // Chinese, traditional.
00156                 'euc-kr' => 1, // Korean
00157                 'shift_jis' => 1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
00158         );
00159
00160                 // see  http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
00161                 // http://czyborra.com/charsets/iso8859.html
00162         var $synonyms = array(
00163                 'us' => 'ascii',
00164                 'us-ascii' => 'ascii',
00165                 'cp819' => 'iso-8859-1',
00166                 'ibm819' => 'iso-8859-1',
00167                 'iso-ir-100' => 'iso-8859-1',
00168                 'iso-ir-101' => 'iso-8859-2',
00169                 'iso-ir-109' => 'iso-8859-3',
00170                 'iso-ir-110' => 'iso-8859-4',
00171                 'iso-ir-144' => 'iso-8859-5',
00172                 'iso-ir-127' => 'iso-8859-6',
00173                 'iso-ir-126' => 'iso-8859-7',
00174                 'iso-ir-138' => 'iso-8859-8',
00175                 'iso-ir-148' => 'iso-8859-9',
00176                 'iso-ir-157' => 'iso-8859-10',
00177                 'iso-ir-179' => 'iso-8859-13',
00178                 'iso-ir-199' => 'iso-8859-14',
00179                 'iso-ir-203' => 'iso-8859-15',
00180                 'csisolatin1' => 'iso-8859-1',
00181                 'csisolatin2' => 'iso-8859-2',
00182                 'csisolatin3' => 'iso-8859-3',
00183                 'csisolatin5' => 'iso-8859-9',
00184                 'csisolatin8' => 'iso-8859-14',
00185                 'csisolatin9' => 'iso-8859-15',
00186                 'csisolatingreek' => 'iso-8859-7',
00187                 'iso-celtic' => 'iso-8859-14',
00188                 'latin1' => 'iso-8859-1',
00189                 'latin2' => 'iso-8859-2',
00190                 'latin3' => 'iso-8859-3',
00191                 'latin5' => 'iso-8859-9',
00192                 'latin6' => 'iso-8859-10',
00193                 'latin8' => 'iso-8859-14',
00194                 'latin9' => 'iso-8859-15',
00195                 'l1' => 'iso-8859-1',
00196                 'l2' => 'iso-8859-2',
00197                 'l3' => 'iso-8859-3',
00198                 'l5' => 'iso-8859-9',
00199                 'l6' => 'iso-8859-10',
00200                 'l8' => 'iso-8859-14',
00201                 'l9' => 'iso-8859-15',
00202                 'cyrillic' => 'iso-8859-5',
00203                 'arabic' => 'iso-8859-6',
00204                 'tis-620' => 'iso-8859-11',
00205                 'win874' => 'windows-874',
00206                 'win1250' => 'windows-1250',
00207                 'win1251' => 'windows-1251',
00208                 'win1252' => 'windows-1252',
00209                 'win1253' => 'windows-1253',
00210                 'win1254' => 'windows-1254',
00211                 'win1255' => 'windows-1255',
00212                 'win1256' => 'windows-1256',
00213                 'win1257' => 'windows-1257',
00214                 'win1258' => 'windows-1258',
00215                 'cp1250' => 'windows-1250',
00216                 'cp1251' => 'windows-1251',
00217                 'cp1252' => 'windows-1252',
00218                 'ms-ee' => 'windows-1250',
00219                 'ms-ansi' => 'windows-1252',
00220                 'ms-greek' => 'windows-1253',
00221                 'ms-turk' => 'windows-1254',
00222                 'winbaltrim' => 'windows-1257',
00223                 'koi-8ru' => 'koi-8r',
00224                 'koi8r' => 'koi-8r',
00225                 'cp878' => 'koi-8r',
00226                 'mac' => 'macroman',
00227                 'macintosh' => 'macroman',
00228                 'euc-cn' => 'gb2312',
00229                 'x-euc-cn' => 'gb2312',
00230                 'euccn' => 'gb2312',
00231                 'cp936' => 'gb2312',
00232                 'big-5' => 'big5',
00233                 'cp950' => 'big5',
00234                 'eucjp' => 'euc-jp',
00235                 'sjis' => 'shift_jis',
00236                 'shift-jis' => 'shift_jis',
00237                 'cp932' => 'shift_jis',
00238                 'cp949' => 'euc-kr',
00239                 'utf7' => 'utf-7',
00240                 'utf8' => 'utf-8',
00241                 'utf16' => 'utf-16',
00242                 'utf32' => 'utf-32',
00243                 'utf8' => 'utf-8',
00244                 'ucs2' => 'ucs-2',
00245                 'ucs4' => 'ucs-4',
00246         );
00247
00248                 // mapping of iso-639-1 language codes to script names
00249         var $lang_to_script = array(
00250                         // iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
00251                 'ar' => 'arabic',
00252                 'bg' => 'cyrillic', // Bulgarian
00253                 'bs' => 'east_european', // Bosnian
00254                 'cs' => 'east_european', // Czech
00255                 'da' => 'west_european', // Danish
00256                 'de' => 'west_european', // German
00257                 'es' => 'west_european', // Spanish
00258                 'et' => 'estonian',
00259                 'eo' => 'unicode', // Esperanto
00260                 'eu' => 'west_european', // Basque
00261                 'fa' => 'arabic', // Persian
00262                 'fi' => 'west_european', // Finish
00263                 'fo' => 'west_european', // Faroese
00264                 'fr' => 'west_european', // French
00265                 'ga' => 'west_european', // Irish
00266                 'gl' => 'west_european', // Galician
00267                 'gr' => 'greek',
00268                 'he' => 'hebrew', // Hebrew (since 1998)
00269                 'hi' => 'unicode', // Hindi
00270                 'hr' => 'east_european', // Croatian
00271                 'hu' => 'east_european', // Hungarian
00272                 'iw' => 'hebrew', // Hebrew (til 1998)
00273                 'is' => 'west_european', // Icelandic
00274                 'it' => 'west_european', // Italian
00275                 'ja' => 'japanese',
00276                 'ka' => 'unicode', // Georgian
00277                 'kl' => 'west_european', // Greenlandic
00278                 'km' => 'unicode', // Khmer
00279                 'ko' => 'korean',
00280                 'lt' => 'lithuanian',
00281                 'lv' => 'west_european', // Latvian/Lettish
00282                 'nl' => 'west_european', // Dutch
00283                 'no' => 'west_european', // Norwegian
00284                 'nb' => 'west_european', // Norwegian Bokmal
00285                 'nn' => 'west_european', // Norwegian Nynorsk
00286                 'pl' => 'east_european', // Polish
00287                 'pt' => 'west_european', // Portuguese
00288                 'ro' => 'east_european', // Romanian
00289                 'ru' => 'cyrillic', // Russian
00290                 'sk' => 'east_european', // Slovak
00291                 'sl' => 'east_european', // Slovenian
00292                 'sr' => 'cyrillic', // Serbian
00293                 'sv' => 'west_european', // Swedish
00294                 'sq' => 'albanian', // Albanian
00295                 'th' => 'thai',
00296                 'uk' => 'cyrillic', // Ukranian
00297                 'vi' => 'vietnamese',
00298                 'zh' => 'chinese',
00299                         // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
00300                         // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
00301                 'ara' => 'arabic',
00302                 'bgr' => 'cyrillic', // Bulgarian
00303                 'cat' => 'west_european', // Catalan
00304                 'chs' => 'simpl_chinese',
00305                 'cht' => 'trad_chinese',
00306                 'csy' => 'east_european', // Czech
00307                 'dan' => 'west_european', // Danisch
00308                 'deu' => 'west_european', // German
00309                 'dea' => 'west_european', // German (Austrian)
00310                 'des' => 'west_european', // German (Swiss)
00311                 'ena' => 'west_european', // English (Australian)
00312                 'enc' => 'west_european', // English (Canadian)
00313                 'eng' => 'west_european', // English
00314                 'enz' => 'west_european', // English (New Zealand)
00315                 'enu' => 'west_european', // English (United States)
00316                 'euq' => 'west_european', // Basque
00317                 'fos' => 'west_european', // Faroese
00318                 'far' => 'arabic', // Persian
00319                 'fin' => 'west_european', // Finish
00320                 'fra' => 'west_european', // French
00321                 'frb' => 'west_european', // French (Belgian)
00322                 'frc' => 'west_european', // French (Canadian)
00323                 'frs' => 'west_european', // French (Swiss)
00324                 'geo' => 'unicode', // Georgian
00325                 'glg' => 'west_european', // Galician
00326                 'ell' => 'greek',
00327                 'heb' => 'hebrew',
00328                 'hin' => 'unicode', // Hindi
00329                 'hun' => 'east_european', // Hungarian
00330                 'isl' => 'west_euorpean', // Icelandic
00331                 'ita' => 'west_european', // Italian
00332                 'its' => 'west_european', // Italian (Swiss)
00333                 'jpn' => 'japanese',
00334                 'khm' => 'unicode', // Khmer
00335                 'kor' => 'korean',
00336                 'lth' => 'lithuanian',
00337                 'lvi' => 'west_european', // Latvian/Lettish
00338                 'msl' => 'west_european', // Malay
00339                 'nlb' => 'west_european', // Dutch (Belgian)
00340                 'nld' => 'west_european', // Dutch
00341                 'nor' => 'west_european', // Norwegian (bokmal)
00342                 'non' => 'west_european', // Norwegian (nynorsk)
00343                 'plk' => 'east_european', // Polish
00344                 'ptg' => 'west_european', // Portuguese
00345                 'ptb' => 'west_european', // Portuguese (Brazil)
00346                 'rom' => 'east_european', // Romanian
00347                 'rus' => 'cyrillic', // Russian
00348                 'slv' => 'east_european', // Slovenian
00349                 'sky' => 'east_european', // Slovak
00350                 'srl' => 'east_european', // Serbian (Latin)
00351                 'srb' => 'cyrillic', // Serbian (Cyrillic)
00352                 'esp' => 'west_european', // Spanish (trad. sort)
00353                 'esm' => 'west_european', // Spanish (Mexican)
00354                 'esn' => 'west_european', // Spanish (internat. sort)
00355                 'sve' => 'west_european', // Swedish
00356                 'sqi' => 'albanian', // Albanian
00357                 'tha' => 'thai',
00358                 'trk' => 'turkish',
00359                 'ukr' => 'cyrillic', // Ukrainian
00360                         // English language names
00361                 'albanian' => 'albanian',
00362                 'arabic' => 'arabic',
00363                 'basque' => 'west_european',
00364                 'bosnian' => 'east_european',
00365                 'bulgarian' => 'east_european',
00366                 'catalan' => 'west_european',
00367                 'croatian' => 'east_european',
00368                 'czech' => 'east_european',
00369                 'danish' => 'west_european',
00370                 'dutch' => 'west_european',
00371                 'english' => 'west_european',
00372                 'esperanto' => 'unicode',
00373                 'estonian' => 'estonian',
00374                 'faroese' => 'west_european',
00375                 'farsi' => 'arabic',
00376                 'finnish' => 'west_european',
00377                 'french' => 'west_european',
00378                 'galician' => 'west_european',
00379                 'georgian' => 'unicode',
00380                 'german' => 'west_european',
00381                 'greek' => 'greek',
00382                 'greenlandic' => 'west_european',
00383                 'hebrew' => 'hebrew',
00384                 'hindi' => 'unicode',
00385                 'hungarian' => 'east_european',
00386                 'icelandic' => 'west_european',
00387                 'italian' => 'west_european',
00388                 'khmer' => 'unicode',
00389                 'latvian' => 'west_european',
00390                 'lettish' => 'west_european',
00391                 'lithuanian' => 'lithuanian',
00392                 'malay' => 'west_european',
00393                 'norwegian' => 'west_european',
00394                 'persian' => 'arabic',
00395                 'polish' => 'east_european',
00396                 'portuguese' => 'west_european',
00397                 'russian' => 'cyrillic',
00398                 'romanian' => 'east_european',
00399                 'serbian' => 'cyrillic',
00400                 'slovak' => 'east_european',
00401                 'slovenian' => 'east_european',
00402                 'spanish' => 'west_european',
00403                 'svedish' => 'west_european',
00404                 'that' => 'thai',
00405                 'turkish' => 'turkish',
00406                 'ukrainian' => 'cyrillic',
00407         );
00408
00409                 // mapping of language (family) names to charsets on Unix
00410         var $script_to_charset_unix = array(
00411                 'west_european' => 'iso-8859-1',
00412                 'estonian' => 'iso-8859-1',
00413                 'east_european' => 'iso-8859-2',
00414                 'baltic' => 'iso-8859-4',
00415                 'cyrillic' => 'iso-8859-5',
00416                 'arabic' => 'iso-8859-6',
00417                 'greek' => 'iso-8859-7',
00418                 'hebrew' => 'iso-8859-8',
00419                 'turkish' => 'iso-8859-9',
00420                 'thai' => 'iso-8859-11', // = TIS-620
00421                 'lithuanian' => 'iso-8859-13',
00422                 'chinese' => 'gb2312', // = euc-cn
00423                 'japanese' => 'euc-jp',
00424                 'korean' => 'euc-kr',
00425                 'simpl_chinese' => 'gb2312',
00426                 'trad_chinese' => 'big5',
00427                 'vietnamese' => '',
00428                 'unicode' => 'utf-8',
00429                 'albanian' => 'utf-8'
00430         );
00431
00432                 // mapping of language (family) names to charsets on Windows
00433         var $script_to_charset_windows = array(
00434                 'east_european' => 'windows-1250',
00435                 'cyrillic' => 'windows-1251',
00436                 'west_european' => 'windows-1252',
00437                 'greek' => 'windows-1253',
00438                 'turkish' => 'windows-1254',
00439                 'hebrew' => 'windows-1255',
00440                 'arabic' => 'windows-1256',
00441                 'baltic' => 'windows-1257',
00442                 'estonian' => 'windows-1257',
00443                 'lithuanian' => 'windows-1257',
00444                 'vietnamese' => 'windows-1258',
00445                 'thai' => 'cp874',
00446                 'korean' => 'cp949',
00447                 'chinese' => 'gb2312',
00448                 'japanese' => 'shift_jis',
00449                 'simpl_chinese' => 'gb2312',
00450                 'trad_chinese' => 'big5',
00451                 'albanian' => 'windows-1250',
00452                 'unicode' => 'utf-8'
00453         );
00454
00455                 // mapping of locale names to charsets
00456         var $locale_to_charset = array(
00457                 'japanese.euc' => 'euc-jp',
00458                 'ja_jp.ujis' => 'euc-jp',
00459                 'korean.euc' => 'euc-kr',
00460                 'sr@Latn' => 'iso-8859-2',
00461                 'zh_cn' => 'gb2312',
00462                 'zh_hk' => 'big5',
00463                 'zh_tw' => 'big5',
00464         );
00465
00466                 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
00467                 // Empty values means "iso-8859-1"
00468         var $charSetArray = array(
00469                 'dk' => '',
00470                 'de' => '',
00471                 'no' => '',
00472                 'it' => '',
00473                 'fr' => '',
00474                 'es' => '',
00475                 'nl' => '',
00476                 'cz' => 'windows-1250',
00477                 'pl' => 'iso-8859-2',
00478                 'si' => 'windows-1250',
00479                 'fi' => '',
00480                 'tr' => 'iso-8859-9',
00481                 'se' => '',
00482                 'pt' => '',
00483                 'ru' => 'windows-1251',
00484                 'ro' => 'iso-8859-2',
00485                 'ch' => 'gb2312',
00486                 'sk' => 'windows-1250',
00487                 'lt' => 'windows-1257',
00488                 'is' => 'utf-8',
00489                 'hr' => 'windows-1250',
00490                 'hu' => 'iso-8859-2',
00491                 'gl' => '',
00492                 'th' => 'iso-8859-11',
00493                 'gr' => 'iso-8859-7',
00494                 'hk' => 'big5',
00495                 'eu' => '',
00496                 'bg' => 'windows-1251',
00497                 'br' => '',
00498                 'et' => 'iso-8859-4',
00499                 'ar' => 'iso-8859-6',
00500                 'he' => 'utf-8',
00501                 'ua' => 'windows-1251',
00502                 'jp' => 'shift_jis',
00503                 'lv' => 'utf-8',
00504                 'vn' => 'utf-8',
00505                 'ca' => 'iso-8859-15',
00506                 'ba' => 'iso-8859-2',
00507                 'kr' => 'euc-kr',
00508                 'eo' => 'utf-8',
00509                 'my' => '',
00510                 'hi' => 'utf-8',
00511                 'fo' => 'utf-8',
00512                 'fa' => 'utf-8',
00513                 'sr' => 'utf-8',
00514                 'sq' => 'utf-8',
00515                 'ge' => 'utf-8',
00516                 'ga' => '',
00517                 'km' => 'utf-8',
00518                 'qc' => '',
00519         );
00520
00521                 // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
00522                 // Missing keys means: same as Typo3
00523         var $isoArray = array(
00524                 'ba' => 'bs',
00525                 'br' => 'pt_BR',
00526                 'ch' => 'zh_CN',
00527                 'cz' => 'cs',
00528                 'dk' => 'da',
00529                 'si' => 'sl',
00530                 'se' => 'sv',
00531                 'gl' => 'kl',
00532                 'gr' => 'el',
00533                 'hk' => 'zh_HK',
00534                 'kr' => 'ko',
00535                 'ua' => 'uk',
00536                 'jp' => 'ja',
00537                 'qc' => 'fr_CA',
00538                 'vn' => 'vi',
00539                 'ge' => 'ka',
00540                 'ga' => 'gl',
00541         );
00542
00550         function parse_charset($charset) {
00551                 $charset = trim(strtolower($charset));
00552                 if (isset($this->synonyms[$charset])) {
00553                         $charset = $this->synonyms[$charset];
00554                 }
00555
00556                 return $charset;
00557         }
00558
00571         function get_locale_charset($locale) {
00572                 $locale = strtolower($locale);
00573
00574                         // exact locale specific charset?
00575                 if (isset($this->locale_to_charset[$locale])) {
00576                         return $this->locale_to_charset[$locale];
00577                 }
00578
00579                         // get modifier
00580                 list($locale, $modifier) = explode('@', $locale);
00581
00582                         // locale contains charset: use it
00583                 list($locale, $charset) = explode('.', $locale);
00584                 if ($charset) {
00585                         return $this->parse_charset($charset);
00586                 }
00587
00588                         // modifier is 'euro' (after charset check, because of xx.utf-8@euro)
00589                 if ($modifier == 'euro') {
00590                         return 'iso-8859-15';
00591                 }
00592
00593                         // get language
00594                 list($language, $country) = explode('_', $locale);
00595                 if (isset($this->lang_to_script[$language])) {
00596                         $script = $this->lang_to_script[$language];
00597                 }
00598
00599                 if (TYPO3_OS == 'WIN') {
00600                         $cs = $this->script_to_charset_windows[$script] ? $this->script_to_charset_windows[$script] : 'windows-1252';
00601                 } else {
00602                         $cs = $this->script_to_charset_unix[$script] ? $this->script_to_charset_unix[$script] : 'iso-8859-1';
00603                 }
00604
00605                 return $cs;
00606         }
00607
00608
00609         /********************************************
00610          *
00611          * Charset Conversion functions
00612          *
00613          ********************************************/
00614
00625         function conv($str, $fromCS, $toCS, $useEntityForNoChar = 0) {
00626                 if ($fromCS == $toCS) {
00627                         return $str;
00628                 }
00629
00630                         // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
00631                 if ($toCS == 'utf-8' || !$useEntityForNoChar) {
00632                         switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']) {
00633                                 case 'mbstring':
00634                                         $conv_str = mb_convert_encoding($str, $toCS, $fromCS);
00635                                         if (FALSE !== $conv_str) {
00636                                                 return $conv_str;
00637                                         } // returns false for unsupported charsets
00638                                         break;
00639
00640                                 case 'iconv':
00641                                         $conv_str = iconv($fromCS, $toCS . '//TRANSLIT', $str);
00642                                         if (FALSE !== $conv_str) {
00643                                                 return $conv_str;
00644                                         }
00645                                         break;
00646
00647                                 case 'recode':
00648                                         $conv_str = recode_string($fromCS . '..' . $toCS, $str);
00649                                         if (FALSE !== $conv_str) {
00650                                                 return $conv_str;
00651                                         }
00652                                         break;
00653                         }
00654                         // fallback to TYPO3 conversion
00655                 }
00656
00657                 if ($fromCS != 'utf-8') {
00658                         $str = $this->utf8_encode($str, $fromCS);
00659                 }
00660                 if ($toCS != 'utf-8') {
00661                         $str = $this->utf8_decode($str, $toCS, $useEntityForNoChar);
00662                 }
00663                 return $str;
00664         }
00665
00677         function convArray(&$array, $fromCS, $toCS, $useEntityForNoChar = 0) {
00678                 foreach ($array as $key => $value) {
00679                         if (is_array($array[$key])) {
00680                                 $this->convArray($array[$key], $fromCS, $toCS, $useEntityForNoChar);
00681                         } elseif (is_string($array[$key])) {
00682                                 $array[$key] = $this->conv($array[$key], $fromCS, $toCS, $useEntityForNoChar);
00683                         }
00684                 }
00685         }
00686
00694         function utf8_encode($str, $charset) {
00695
00696                 if ($charset === 'utf-8') {
00697                         return $str;
00698                 }
00699
00700                         // Charset is case-insensitive.
00701                 if ($this->initCharset($charset)) { // Parse conv. table if not already...
00702                         $strLen = strlen($str);
00703                         $outStr = '';
00704
00705                         for ($a = 0; $a < $strLen; $a++) { // Traverse each char in string.
00706                                 $chr = substr($str, $a, 1);
00707                                 $ord = ord($chr);
00708                                 if (isset($this->twoByteSets[$charset])) { // If the charset has two bytes per char
00709                                         $ord2 = ord($str{$a + 1});
00710                                         $ord = $ord << 8 | $ord2; // assume big endian
00711
00712                                         if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
00713                                                 $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
00714                                         } else {
00715                                                 $outStr .= chr($this->noCharByteVal);
00716                                         } // No char exists
00717                                         $a++;
00718                                 } elseif ($ord > 127) { // If char has value over 127 it's a multibyte char in UTF-8
00719                                         if (isset($this->eucBasedSets[$charset])) { // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
00720                                                 if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF)) { // Shift-JIS: chars between 160 and 223 are single byte
00721                                                         $a++;
00722                                                         $ord2 = ord(substr($str, $a, 1));
00723                                                         $ord = $ord * 256 + $ord2;
00724                                                 }
00725                                         }
00726
00727                                         if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
00728                                                 $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
00729                                         } else {
00730                                                 $outStr .= chr($this->noCharByteVal);
00731                                         } // No char exists
00732                                 } else {
00733                                         $outStr .= $chr;
00734                                 } // ... otherwise it's just ASCII 0-127 and one byte. Transparent
00735                         }
00736                         return $outStr;
00737                 }
00738         }
00739
00748         function utf8_decode($str, $charset, $useEntityForNoChar = 0) {
00749
00750                 if ($charset === 'utf-8') {
00751                         return $str;
00752                 }
00753
00754                         // Charset is case-insensitive.
00755                 if ($this->initCharset($charset)) { // Parse conv. table if not already...
00756                         $strLen = strlen($str);
00757                         $outStr = '';
00758                         $buf = '';
00759                         for ($a = 0, $i = 0; $a < $strLen; $a++, $i++) { // Traverse each char in UTF-8 string.
00760                                 $chr = substr($str, $a, 1);
00761                                 $ord = ord($chr);
00762                                 if ($ord > 127) { // This means multibyte! (first byte!)
00763                                         if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
00764
00765                                                 $buf = $chr; // Add first byte
00766                                                 for ($b = 0; $b < 8; $b++) { // for each byte in multibyte string...
00767                                                         $ord = $ord << 1; // Shift it left and ...
00768                                                         if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
00769                                                                 $a++; // Increase pointer...
00770                                                                 $buf .= substr($str, $a, 1); // ... and add the next char.
00771                                                         } else {
00772                                                                 break;
00773                                                         }
00774                                                 }
00775
00776                                                 if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) { // If the UTF-8 char-sequence is found then...
00777                                                         $mByte = $this->parsedCharsets[$charset]['utf8'][$buf]; // The local number
00778                                                         if ($mByte > 255) { // If the local number is greater than 255 we will need to split the byte (16bit word assumed) in two chars.
00779                                                                 $outStr .= chr(($mByte >> 8) & 255) . chr($mByte & 255);
00780                                                         } else {
00781                                                                 $outStr .= chr($mByte);
00782                                                         }
00783                                                 } elseif ($useEntityForNoChar) { // Create num entity:
00784                                                         $outStr .= '&#' . $this->utf8CharToUnumber($buf, 1) . ';';
00785                                                 } else {
00786                                                         $outStr .= chr($this->noCharByteVal);
00787                                                 } // No char exists
00788                                         } else {
00789                                                 $outStr .= chr($this->noCharByteVal);
00790                                         } // No char exists (MIDDLE of MB sequence!)
00791                                 } else {
00792                                         $outStr .= $chr;
00793                                 } // ... otherwise it's just ASCII 0-127 and one byte. Transparent
00794                         }
00795                         return $outStr;
00796                 }
00797         }
00798
/**
 * Replaces every multi-byte UTF-8 character in a string with a hexadecimal
 * numeric entity ("&#x" followed by the hex code point and ";").
 * Bytes 0-127 are copied unchanged. A byte above 127 that cannot start a
 * sequence (bit 6 not set) is replaced by the "no character" byte.
 *
 * @param string $str Input string, expected to be UTF-8
 * @return string Output string with multi-byte characters turned into entities
 */
function utf8_to_entities($str) {
    $length = strlen($str);
    $result = '';
    $pos = 0;

    while ($pos < $length) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);

        if ($code <= 127) {
            // Plain 7-bit ASCII is passed through as-is.
            $result .= $byte;
        } elseif (!($code & 64)) {
            // High bit set but bit 6 clear: we are in the middle of a sequence.
            $result .= chr($this->noCharByteVal);
        } else {
            // Lead byte: every additional leading 1-bit announces one more byte.
            $sequence = $byte;
            for ($n = 0; $n < 8; $n++) {
                $code = $code << 1;
                if (!($code & 128)) {
                    break;
                }
                $pos++;
                $sequence .= substr($str, $pos, 1);
            }
            $result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
        }

        $pos++;
    }

    return $result;
}
00836
/**
 * Converts numeric HTML entities (and optionally the standard named HTML
 * entities) in a string into their UTF-8 characters.
 *
 * Decimal ("&#233;") and hexadecimal ("&#xE9;" / "&#XE9;") entities are always
 * converted. Named entities ("&eacute;") are converted only when
 * $alsoStdHtmlEnt is set. Anything that looks like an entity but is neither a
 * well-formed numeric entity nor a known named entity is left untouched.
 *
 * @param string $str Input string
 * @param boolean $alsoStdHtmlEnt If set, named entities from PHP's HTML_ENTITIES table are converted as well
 * @return string Output string
 */
function entities_to_utf8($str, $alsoStdHtmlEnt = 0) {
    $trans_tbl = array();
    if ($alsoStdHtmlEnt) {
        // Ask for the table in UTF-8 explicitly: the default charset of
        // get_html_translation_table() changed to UTF-8 in PHP 5.4, so relying on
        // the default and re-encoding from iso-8859-1 would double-encode.
        // (The third argument requires PHP >= 5.3.4.)
        $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8'));
    }

    // Split on "&...;" while keeping the entity body: even indexes hold plain
    // text, odd indexes hold the text between "&" and ";".
    $parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
    if ($parts === FALSE) {
        return $str;
    }

    foreach ($parts as $k => $v) {
        if (!($k % 2)) {
            continue;
        }

        $match = array();
        if (preg_match('/^#[xX]([0-9a-fA-F]+)$/', $v, $match)) {
            // Hexadecimal numeric entity
            $parts[$k] = $this->UnumberToChar(hexdec($match[1]));
        } elseif (preg_match('/^#([0-9]+)$/', $v, $match)) {
            // Decimal numeric entity
            $parts[$k] = $this->UnumberToChar((int) $match[1]);
        } elseif ($alsoStdHtmlEnt && isset($trans_tbl['&' . $v . ';'])) {
            // Named entity; the table already holds the UTF-8 character
            $parts[$k] = $trans_tbl['&' . $v . ';'];
        } else {
            // Unknown or malformed: restore the original text
            $parts[$k] = '&' . $v . ';';
        }
    }

    return implode('', $parts);
}
00869
/**
 * Splits a UTF-8 string into an array with one element per character.
 * Each element is either the Unicode number of the character or, when
 * $retChar is set, the character itself as a UTF-8 byte sequence.
 * A stray byte above 127 that cannot start a sequence yields the
 * "no character" value (or its byte when $retChar is set).
 *
 * @param string $str Input string, expected to be UTF-8
 * @param boolean $convEntities If set, HTML entities (numeric and named) are converted to UTF-8 first
 * @param boolean $retChar If set, characters are returned instead of Unicode numbers
 * @return array One entry per character
 */
function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) {
    if ($convEntities) {
        $str = $this->entities_to_utf8($str, 1);
    }

    $length = strlen($str);
    $result = array();
    $pos = 0;

    while ($pos < $length) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);

        if ($code <= 127) {
            // 7-bit ASCII: one byte, one character.
            $result[] = $retChar ? chr($code) : $code;
        } elseif (!($code & 64)) {
            // Continuation byte without a lead byte.
            $result[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
        } else {
            // Lead byte: collect one further byte per additional leading 1-bit.
            $sequence = $byte;
            for ($n = 0; $n < 8; $n++) {
                $code = $code << 1;
                if (!($code & 128)) {
                    break;
                }
                $pos++;
                $sequence .= substr($str, $pos, 1);
            }
            $result[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
        }

        $pos++;
    }

    return $result;
}
00914
/**
 * Converts a Unicode number into its UTF-8 byte sequence (1 to 6 bytes).
 * Numbers of 0x80000000 and above cannot be expressed and yield the
 * "no character" byte instead.
 *
 * Callers in this class pass both integers and strings taken from entity
 * text, so any argument that is not already an int or float is normalised to
 * int first. Without this, an empty or non-numeric string would reach chr()
 * and raise a TypeError on PHP 8.
 *
 * @param integer $cbyte Unicode number (numeric strings are accepted)
 * @return string UTF-8 encoded character
 */
function UnumberToChar($cbyte) {
    if (!is_int($cbyte) && !is_float($cbyte)) {
        $cbyte = (int) $cbyte;
    }

    if ($cbyte < 0x80) {
        return chr($cbyte);
    }
    if ($cbyte < 0x800) {
        return chr(0xC0 | ($cbyte >> 6))
            . chr(0x80 | ($cbyte & 0x3F));
    }
    if ($cbyte < 0x10000) {
        return chr(0xE0 | ($cbyte >> 12))
            . chr(0x80 | (($cbyte >> 6) & 0x3F))
            . chr(0x80 | ($cbyte & 0x3F));
    }
    if ($cbyte < 0x200000) {
        return chr(0xF0 | ($cbyte >> 18))
            . chr(0x80 | (($cbyte >> 12) & 0x3F))
            . chr(0x80 | (($cbyte >> 6) & 0x3F))
            . chr(0x80 | ($cbyte & 0x3F));
    }
    if ($cbyte < 0x4000000) {
        return chr(0xF8 | ($cbyte >> 24))
            . chr(0x80 | (($cbyte >> 18) & 0x3F))
            . chr(0x80 | (($cbyte >> 12) & 0x3F))
            . chr(0x80 | (($cbyte >> 6) & 0x3F))
            . chr(0x80 | ($cbyte & 0x3F));
    }
    if ($cbyte < 0x80000000) {
        return chr(0xFC | ($cbyte >> 30))
            . chr(0x80 | (($cbyte >> 24) & 0x3F))
            . chr(0x80 | (($cbyte >> 18) & 0x3F))
            . chr(0x80 | (($cbyte >> 12) & 0x3F))
            . chr(0x80 | (($cbyte >> 6) & 0x3F))
            . chr(0x80 | ($cbyte & 0x3F));
    }

    // Cannot express a 32-bit character in UTF-8
    return chr($this->noCharByteVal);
}
00979
/**
 * Converts a single UTF-8 character (byte sequence) into its Unicode number.
 * If the first byte is not a multi-byte lead byte (top two bits set), the
 * byte value itself is returned.
 *
 * @param string $str UTF-8 byte sequence of one character; only the first character is read
 * @param boolean $hex If set, the result is returned as "x" followed by the lowercase hexadecimal number
 * @return mixed Unicode number as integer, or the "x..." string when $hex is set
 */
function utf8CharToUnumber($str, $hex = 0) {
    $firstByte = ord(substr($str, 0, 1));

    if (($firstByte & 192) != 192) {
        // Single byte (or a lone continuation byte): the value is the byte itself.
        $number = $firstByte;
    } else {
        // Each leading 1-bit after the first announces one continuation byte,
        // and every continuation byte contributes its low six bits.
        $shifted = $firstByte;
        $tailBits = '';
        $followers = 0;
        while ($followers < 8) {
            $shifted = $shifted << 1;
            if (!($shifted & 128)) {
                break;
            }
            $tailBits .= substr('00000000' . decbin(ord(substr($str, $followers + 1, 1))), -6);
            $followers++;
        }

        // The lead byte carries the remaining (6 - followers) payload bits.
        $headBits = substr('00000000' . decbin($firstByte), -(6 - $followers));
        $number = bindec($headBits . $tailBits);
    }

    if ($hex) {
        return 'x' . dechex($number);
    }
    return $number;
}
01011
01012
01013         /********************************************
01014          *
01015          * Init functions
01016          *
01017          ********************************************/
01018
/**
 * Loads the conversion table for a charset into $this->parsedCharsets.
 *
 * The table is read from PATH_t3lib . 'csconvtbl/<charset>.tbl' and stored as
 * two maps: 'local' (charset value => UTF-8 character) and 'utf8' (UTF-8
 * character => charset value). Only values above 127 are registered. A
 * serialized copy is kept in typo3temp/cs/ and reused on later requests.
 *
 * A cache file that does not unserialize to the expected structure is ignored
 * and rebuilt instead of being stored as the table, and table lines that do
 * not match the detected syntax are skipped.
 *
 * @param string $charset Charset name; must match a file name in csconvtbl/
 * @return mixed 1 if the table was already loaded, 2 if it was loaded now (from cache or by parsing), FALSE if no table file exists
 */
function initCharset($charset) {
    // Nothing to do if the table is already in memory.
    if (!empty($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
        return 1;
    }

    // Conversion table filename:
    $charsetConvTableFile = PATH_t3lib . 'csconvtbl/' . $charset . '.tbl';
    if (!($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))) {
        return FALSE;
    }

    // Cache file for charsets:
    // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
    $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        // NOTE(review): unserialize() is only acceptable here as long as the
        // cache directory cannot be written by untrusted parties.
        $cached = unserialize(t3lib_div::getUrl($cacheFile));
        if (is_array($cached) && isset($cached['local']) && isset($cached['utf8'])) {
            $this->parsedCharsets[$charset] = $cached;
            return 2;
        }
        // Otherwise the cache is unusable: fall through and rebuild it.
    }

    // Parse conversion table into lines:
    $lines = t3lib_div::trimExplode(LF, t3lib_div::getUrl($charsetConvTableFile), 1);
    // Initialize the internal variable holding the conv. table:
    $this->parsedCharsets[$charset] = array('local' => array(), 'utf8' => array());

    // The "whitespaced" type has the syntax "0x0A   0x000A  #LINE FEED"
    // while "ms-token" looks like "B9 = U+00B9 : SUPERSCRIPT ONE".
    $whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
    $detectedType = '';

    foreach ($lines as $value) {
        // Comment lines and blanks are ignored.
        if (!trim($value) || substr($value, 0, 1) == '#') {
            continue;
        }

        // Detect the syntax once, on the first real line.
        if (!$detectedType) {
            $detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
        }

        if ($detectedType == 'ms-token') {
            $fields = preg_split('/[=:]/', $value, 3);
            if (count($fields) < 2) {
                // No "=" separator: not a mapping line.
                continue;
            }
            $hexbyte = $fields[0];
            $utf8 = $fields[1];
        } else {
            $regA = array();
            if (!preg_match($whitespacedPattern, $value, $regA)) {
                // Line does not follow the detected syntax.
                continue;
            }
            $hexbyte = $regA[1];
            $utf8 = 'U+' . $regA[2];
        }

        $decval = hexdec(trim($hexbyte));
        if ($decval > 127) {
            // Strip the "U+" prefix and convert the code point to UTF-8.
            $utf8decval = hexdec(substr(trim($utf8), 2));
            $utf8Char = $this->UnumberToChar($utf8decval);
            $this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
            $this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
        }
    }

    if ($cacheFile) {
        t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
    }

    return 2;
}
01087
/**
 * Builds (or loads from cache) the UTF-8 case folding table and the
 * UTF-8-to-ASCII transliteration table.
 *
 * The data is derived from unidata/UnicodeData.txt (required), plus
 * unidata/SpecialCasing.txt and unidata/Translit.txt when present, all below
 * PATH_t3lib. Results are stored in $this->caseFolding['utf-8'] (keys
 * 'toUpper', 'toLower', 'toTitle') and $this->toASCII['utf-8'], and written
 * to cache files in typo3temp/cs/. Only the Basic Multilingual Plane is
 * processed.
 *
 * String offsets use substr() instead of the curly-brace syntax, which is a
 * parse error on PHP 8. Short or blank data lines no longer produce undefined
 * offsets, and a cache file that does not unserialize to an array is rebuilt.
 *
 * @param string $mode 'case' or 'ascii' to return early when that table is already loaded or cached; NULL to always rebuild
 * @return mixed 1 if the requested table was already in memory, 2 if it was read from the cache file, 3 if both tables were rebuilt from the data files, FALSE if UnicodeData.txt is missing or cannot be opened
 */
function initUnicodeData($mode = NULL) {
    // cache files
    $cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
    $cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

    // Only process if the tables are not yet loaded
    switch ($mode) {
        case 'case':
            if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) {
                return 1;
            }

            // Use cached version if possible
            // NOTE(review): unserialize() is only acceptable while the cache
            // directory cannot be written by untrusted parties.
            if ($cacheFileCase && @is_file($cacheFileCase)) {
                $cached = unserialize(t3lib_div::getUrl($cacheFileCase));
                if (is_array($cached)) {
                    $this->caseFolding['utf-8'] = $cached;
                    return 2;
                }
            }
            break;

        case 'ascii':
            if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) {
                return 1;
            }

            // Use cached version if possible
            if ($cacheFileASCII && @is_file($cacheFileASCII)) {
                $cached = unserialize(t3lib_div::getUrl($cacheFileASCII));
                if (is_array($cached)) {
                    $this->toASCII['utf-8'] = $cached;
                    return 2;
                }
            }
            break;
    }

    // process main Unicode data file
    $unicodeDataFile = PATH_t3lib . 'unidata/UnicodeData.txt';
    if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
        return FALSE;
    }

    $fh = fopen($unicodeDataFile, 'rb');
    if (!$fh) {
        return FALSE;
    }

    // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
    // note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
    $this->caseFolding['utf-8'] = array();
    $utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
    $utf8CaseFolding['toUpper'] = array();
    $utf8CaseFolding['toLower'] = array();
    $utf8CaseFolding['toTitle'] = array();

    $decomposition = array(); // array of temp. decompositions
    $mark = array(); // array of chars that are marks (eg. composing accents)
    $number = array(); // array of chars that are numbers (eg. digits)
    $omit = array(); // array of chars to be omitted (eg. Russian hard sign)

    while (!feof($fh)) {
        $line = fgets($fh, 4096);
        if ($line === FALSE) {
            // read error or end of file: never spin on feof()
            break;
        }
        if (trim($line) === '') {
            continue;
        }

        // A line has a lot of info; pad so that short lines do not leave variables undefined.
        list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title) = array_pad(explode(';', rtrim($line)), 15, '');

        $ord = hexdec($char);
        if ($ord > 0xFFFF) {
            break;
        } // only process the BMP

        $utf8_char = $this->UnumberToChar($ord);

        if ($upper) {
            $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
        }
        if ($lower) {
            $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
        }
        // store "title" only when different from "upper" (only a few)
        if ($title && $title != $upper) {
            $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
        }

        // First letter of the general category
        switch (substr($cat, 0, 1)) {
            case 'M': // mark (accent, umlaut, ...)
                $mark["U+$char"] = 1;
                break;

            case 'N': // numeric value
                if ($ord > 0x80 && $num != '') {
                    $number["U+$char"] = $num;
                }
                break;
        }

        // accented Latin letters without "official" decomposition
        $match = array();
        if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
            $c = ord($match[2]);
            if ($match[1] == 'SMALL') {
                $c += 32;
            }

            $decomposition["U+$char"] = array(dechex($c));
            continue;
        }

        $match = array();
        if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
            switch ($match[1]) {
                case '<circle>': // add parenthesis as circle replacement, eg (1)
                    $match[2] = '0028 ' . $match[2] . ' 0029';
                    break;

                case '<square>': // add square brackets as square replacement, eg [1]
                    $match[2] = '005B ' . $match[2] . ' 005D';
                    break;

                case '<compat>': // ignore multi char decompositions that start with a space
                    if (preg_match('/^0020 /', $match[2])) {
                        continue 2;
                    }
                    break;

                // ignore Arabic and vertical layout presentation decomposition
                case '<initial>':
                case '<medial>':
                case '<final>':
                case '<isolated>':
                case '<vertical>':
                    continue 2;
            }
            $decomposition["U+$char"] = explode(' ', $match[2]);
        }
    }
    fclose($fh);

    // process additional Unicode data for casing (allow folded characters to expand into a sequence)
    $specialCasingFile = PATH_t3lib . 'unidata/SpecialCasing.txt';
    if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
        $fh = fopen($specialCasingFile, 'rb');
        if ($fh) {
            while (!feof($fh)) {
                $line = fgets($fh, 4096);
                if ($line === FALSE) {
                    break;
                }
                if (substr($line, 0, 1) != '#' && trim($line) != '') {

                    list($char, $lower, $title, $upper, $cond) = array_pad(t3lib_div::trimExplode(';', $line), 5, '');
                    // Only unconditional mappings: the condition field is empty or already the trailing comment.
                    if ($cond == '' || substr($cond, 0, 1) == '#') {
                        $utf8_char = $this->UnumberToChar(hexdec($char));
                        if ($char != $lower) {
                            $arr = explode(' ', $lower);
                            foreach ($arr as $i => $code) {
                                $arr[$i] = $this->UnumberToChar(hexdec($code));
                            }
                            $utf8CaseFolding['toLower'][$utf8_char] = implode('', $arr);
                        }
                        if ($char != $title && $title != $upper) {
                            $arr = explode(' ', $title);
                            foreach ($arr as $i => $code) {
                                $arr[$i] = $this->UnumberToChar(hexdec($code));
                            }
                            $utf8CaseFolding['toTitle'][$utf8_char] = implode('', $arr);
                        }
                        if ($char != $upper) {
                            $arr = explode(' ', $upper);
                            foreach ($arr as $i => $code) {
                                $arr[$i] = $this->UnumberToChar(hexdec($code));
                            }
                            $utf8CaseFolding['toUpper'][$utf8_char] = implode('', $arr);
                        }
                    }
                }
            }
            fclose($fh);
        }
    }

    // process custom decompositions
    $customTranslitFile = PATH_t3lib . 'unidata/Translit.txt';
    if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
        $fh = fopen($customTranslitFile, 'rb');
        if ($fh) {
            while (!feof($fh)) {
                $line = fgets($fh, 4096);
                if ($line === FALSE) {
                    break;
                }
                if (substr($line, 0, 1) != '#' && trim($line) != '') {
                    list($char, $translit) = array_pad(t3lib_div::trimExplode(';', $line), 2, '');
                    if (!$translit) {
                        // empty transliteration: the character is to be dropped
                        $omit["U+$char"] = 1;
                    }
                    $decomposition["U+$char"] = explode(' ', $translit);
                }
            }
            fclose($fh);
        }
    }

    // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
    foreach ($decomposition as $from => $to) {
        $code_decomp = array();

        while ($code_value = array_shift($to)) {
            if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
                foreach (array_reverse($decomposition["U+$code_value"]) as $cv) {
                    array_unshift($to, $cv);
                }
            } elseif (!isset($mark["U+$code_value"])) { // remove mark
                array_push($code_decomp, $code_value);
            }
        }
        if (count($code_decomp) || isset($omit[$from])) {
            $decomposition[$from] = $code_decomp;
        } else {
            unset($decomposition[$from]);
        }
    }

    // create ascii only mapping
    $this->toASCII['utf-8'] = array();
    $ascii =& $this->toASCII['utf-8'];

    foreach ($decomposition as $from => $to) {
        $code_decomp = array();
        while ($code_value = array_shift($to)) {
            $ord = hexdec($code_value);
            if ($ord > 127) {
                // skip decompositions containing non-ASCII chars
                continue 2;
            }
            array_push($code_decomp, chr($ord));
        }
        $ascii[$this->UnumberToChar(hexdec($from))] = implode('', $code_decomp);
    }

    // add numeric decompositions
    foreach ($number as $from => $to) {
        $utf8_char = $this->UnumberToChar(hexdec($from));
        if (!isset($ascii[$utf8_char])) {
            $ascii[$utf8_char] = $to;
        }
    }

    if ($cacheFileCase) {
        t3lib_div::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
    }

    if ($cacheFileASCII) {
        t3lib_div::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
    }

    return 3;
}
01340
/**
 * Makes sure the case folding table of a charset is available in $this->caseFolding[$charset].
 *
 * The table is taken from memory, from the cache file typo3temp/cs/cscase_<charset>.tbl,
 * or it is derived from the UTF-8 case folding data and then written to that cache file.
 *
 * @param string $charset Charset key; used to index $this->parsedCharsets and to name the cache file
 * @return integer|boolean 1 = already loaded, 2 = read from the cache file, 3 = freshly built, FALSE = the charset or the Unicode data could not be initialised
 */
function initCaseFolding($charset) {
    // Only process if the case table is not yet loaded:
    if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) {
        return 1;
    }

    // Use cached version if possible
    $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $cachedTable = unserialize(t3lib_div::getUrl($cacheFile));
        if (is_array($cachedTable)) {
            $this->caseFolding[$charset] = $cachedTable;
            return 2;
        }
        // The cache file is unreadable or corrupt: fall through and rebuild it
        // instead of storing a non-array and reporting success.
    }

    // init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return FALSE;
    }

    // UTF-8 case folding is used as the base conversion table
    if (!$this->initUnicodeData('case')) {
        return FALSE;
    }

    $nochar = chr($this->noCharByteVal);
    $directions = array('toUpper', 'toLower', 'toTitle');
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);

        foreach ($directions as $direction) {
            // characters without a mapping in this direction are skipped
            if (!isset($this->caseFolding['utf-8'][$direction][$utf8])) {
                continue;
            }
            $cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
            // only keep mappings whose target exists in the charset
            if ($cc != '' && $cc != $nochar) {
                $this->caseFolding[$charset][$direction][$c] = $cc;
            }
        }
    }

    // add the ASCII case table (upper and lower case letters are 32 code points apart)
    for ($i = ord('a'); $i <= ord('z'); $i++) {
        $this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i - 32);
    }
    for ($i = ord('A'); $i <= ord('Z'); $i++) {
        $this->caseFolding[$charset]['toLower'][chr($i)] = chr($i + 32);
    }

    if ($cacheFile) {
        t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
    }

    return 3;
}
01410
/**
 * Makes sure the charset-to-ASCII table of a charset is available in $this->toASCII[$charset].
 *
 * The table is taken from memory, from the cache file typo3temp/cs/csascii_<charset>.tbl,
 * or it is derived from the UTF-8 to ASCII table and then written to that cache file.
 *
 * @param string $charset Charset key; used to index $this->parsedCharsets and to name the cache file
 * @return integer|boolean 1 = already loaded, 2 = read from the cache file, 3 = freshly built, FALSE = the charset or the Unicode data could not be initialised
 */
function initToASCII($charset) {
    // Only process if the ASCII table is not yet loaded:
    if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
        return 1;
    }

    // Use cached version if possible
    $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $cachedTable = unserialize(t3lib_div::getUrl($cacheFile));
        if (is_array($cachedTable)) {
            $this->toASCII[$charset] = $cachedTable;
            return 2;
        }
        // The cache file is unreadable or corrupt: fall through and rebuild it.
    }

    // init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return FALSE;
    }

    // UTF-8/ASCII transliteration is used as the base conversion table
    if (!$this->initUnicodeData('ascii')) {
        return FALSE;
    }

    // Start from an empty table so that a charset without any mappable character
    // still ends up with an array (and a valid cache file) instead of NULL.
    $this->toASCII[$charset] = array();
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        if (isset($this->toASCII['utf-8'][$utf8])) {
            // reconvert to charset (don't use chr() of numeric value, might be multi-byte)
            $c = $this->utf8_decode($utf8, $charset);
            $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
        }
    }

    if ($cacheFile) {
        t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
    }

    return 3;
}
01458
01459
01460         /********************************************
01461          *
01462          * String operation functions
01463          *
01464          ********************************************/
01465
/**
 * Returns a part of a string, counted in characters of the given charset.
 *
 * Depending on $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] the work is done by mbstring,
 * by iconv, or by the internal implementations for UTF-8, EUC based, two-byte and
 * four-byte charsets. Every other charset is treated as a single-byte encoding.
 *
 * @param string $charset Charset of $string
 * @param string $string String to cut
 * @param integer $start Start position, in characters
 * @param integer $len Number of characters; NULL means "up to the end of the string"
 * @return string|boolean The substring; the underlying functions may return FALSE for an out-of-range $start
 */
function substr($charset, $string, $start, $len = NULL) {
    if ($len === 0 || $string === '') {
        return '';
    }

    $utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

    if ($utils == 'mbstring') {
        // The length cannot be omitted when a charset is given. The character count
        // of the whole string is an upper bound, so it selects "up to the end"
        // without touching the global internal encoding.
        if ($len == NULL) {
            $len = mb_strlen($string, $charset);
        }
        return mb_substr($string, $start, $len, $charset);
    } elseif ($utils == 'iconv') {
        // Same approach as for mbstring; this also avoids iconv_set_encoding().
        if ($len == NULL) {
            $len = iconv_strlen($string, $charset);
            if ($len === FALSE) {
                // the string is not valid in $charset
                return FALSE;
            }
        }
        return iconv_substr($string, $start, $len, $charset);
    } elseif ($charset == 'utf-8') {
        return $this->utf8_substr($string, $start, $len);
    } elseif (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_substr($string, $start, $charset, $len);
    } elseif (!empty($this->twoByteSets[$charset])) {
        // fixed width: two bytes per character; an omitted length must not become 0
        return $len === NULL ? substr($string, $start * 2) : substr($string, $start * 2, $len * 2);
    } elseif (!empty($this->fourByteSets[$charset])) {
        // fixed width: four bytes per character
        return $len === NULL ? substr($string, $start * 4) : substr($string, $start * 4, $len * 4);
    }

    // treat everything else as single-byte encoding
    return $len === NULL ? substr($string, $start) : substr($string, $start, $len);
}
01522
/**
 * Counts the characters of a string in the given charset.
 *
 * Depending on $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] the work is done by mbstring,
 * by iconv, or by the internal implementations. Charsets that are not known as
 * multi-byte are treated as single-byte encodings.
 *
 * @param string $charset Charset of $string
 * @param string $string String to measure
 * @return integer|float Number of characters (for two- and four-byte charsets: byte length divided by the character width)
 */
function strlen($charset, $string) {
    $utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

    if ($utils == 'mbstring') {
        return mb_strlen($string, $charset);
    } elseif ($utils == 'iconv') {
        return iconv_strlen($string, $charset);
    } elseif ($charset == 'utf-8') {
        return $this->utf8_strlen($string);
    } elseif (!empty($this->eucBasedSets[$charset])) {
        // !empty() instead of a plain read: most charsets have no entry in these tables
        return $this->euc_strlen($string, $charset);
    } elseif (!empty($this->twoByteSets[$charset])) {
        return strlen($string) / 2;
    } elseif (!empty($this->fourByteSets[$charset])) {
        return strlen($string) / 4;
    }

    // treat everything else as single-byte encoding
    return strlen($string);
}
01550
/**
 * Crops a string to a number of characters using the mbstring functions.
 *
 * @param string $charset Charset of $string
 * @param string $string String to crop
 * @param integer $len Positive: keep that many characters from the start; negative: keep that many from the end; 0: do not crop
 * @param string $crop Marker added at the side where characters were removed
 * @return string The cropped string, or $string unchanged if it is not longer than abs($len) characters
 */
protected function cropMbstring($charset, $string, $len, $crop = '') {
    // no limit given
    if (intval($len) === 0) {
        return $string;
    }

    // the string already fits
    $characterCount = mb_strlen($string, $charset);
    if ($characterCount <= abs($len)) {
        return $string;
    }

    if ($len > 0) {
        // keep the head, mark the cut at the end
        return mb_substr($string, 0, $len, $charset) . $crop;
    }

    // keep the tail, mark the cut at the start
    return $crop . mb_substr($string, $len, $characterCount, $charset);
}
01574
/**
 * Crops a string to a number of characters and adds a marker where text was removed.
 *
 * With mbstring configured in $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] the work is
 * delegated to cropMbstring(). Otherwise the byte position of the cut is determined
 * for UTF-8 and EUC based charsets; every other charset is treated as single-byte.
 *
 * @param string $charset Charset of $string
 * @param string $string String to crop
 * @param integer $len Positive: keep that many characters from the start; negative: keep that many from the end; 0: do not crop
 * @param string $crop Marker added at the side where characters were removed
 * @return string The cropped string, or $string unchanged if nothing had to be removed
 */
function crop($charset, $string, $len, $crop = '') {
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return $this->cropMbstring($charset, $string, $len, $crop);
    }

    if (intval($len) == 0) {
        return $string;
    }

    // Find the byte position of the cut; FALSE means $len exceeds the string length.
    if ($charset == 'utf-8') {
        $i = $this->utf8_char2byte_pos($string, $len);
    } elseif (!empty($this->eucBasedSets[$charset])) {
        $i = $this->euc_char2byte_pos($string, $len, $charset);
    } elseif ($len > 0) {
        $i = $len;
    } else {
        $i = strlen($string) + $len;
        if ($i <= 0) {
            $i = FALSE;
        }
    }

    if ($i === FALSE) {
        // $len outside actual string length
        return $string;
    }

    $i = (int) $i;
    if ($len > 0) {
        // only crop if there really is something behind the cut position
        if (isset($string[$i])) {
            return substr($string, 0, $i) . $crop;
        }
    } elseif ($i >= 1 && isset($string[$i - 1])) {
        // only crop if there really is something in front of the cut position;
        // the explicit lower bound keeps a negative offset from addressing the string end
        return $crop . substr($string, $i);
    }

    return $string;
}
01637
/**
 * Truncates a string to at most $len bytes without cutting a character in two.
 *
 * Uses mb_strcut() when mbstring is configured in $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'],
 * otherwise the internal implementations. For two- and four-byte charsets the length is
 * rounded down to a multiple of the character width; every other charset is treated as
 * single-byte.
 *
 * @param string $charset Charset of $string
 * @param string $string String to truncate
 * @param integer $len Maximum length in bytes
 * @return string The truncated string; empty if $len is not positive
 */
function strtrunc($charset, $string, $len) {
    if ($len <= 0) {
        return '';
    }

    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return mb_strcut($string, 0, $len, $charset);
    } elseif ($charset == 'utf-8') {
        return $this->utf8_strtrunc($string, $len);
    } elseif (!empty($this->eucBasedSets[$charset])) {
        // !empty() instead of a plain read: most charsets have no entry in these tables
        return $this->euc_strtrunc($string, $len, $charset);
    } elseif (!empty($this->twoByteSets[$charset])) {
        // don't cut at odd positions
        $len -= $len % 2;
    } elseif (!empty($this->fourByteSets[$charset])) {
        // realign to a position divisible by four
        $len -= $len % 4;
    }

    // treat everything else as single-byte encoding
    return substr($string, 0, $len);
}
01670
/**
 * Converts the case of a string in the given charset.
 *
 * With mbstring configured in $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] the string is
 * lower-cased for 'toLower' and upper-cased for every other value of $case. Otherwise
 * the internal mapping functions are used with $case as the name of the mapping table.
 *
 * @param string $charset Charset of $string
 * @param string $string String to convert
 * @param string $case 'toLower' or 'toUpper'
 * @return string The converted string
 */
function conv_case($charset, $string, $case) {
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return $case == 'toLower'
            ? mb_strtolower($string, $charset)
            : mb_strtoupper($string, $charset);
    }

    if ($charset == 'utf-8') {
        return $this->utf8_char_mapping($string, 'case', $case);
    }

    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'case', $case);
    }

    // treat everything else as single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'case', $case);
}
01704
/**
 * Replaces the characters of a string by their ASCII equivalents, using the
 * 'ascii' mapping of the charset.
 *
 * @param string $charset Charset of $string
 * @param string $string String to convert
 * @return string The converted string
 */
function specCharsToASCII($charset, $string) {
    if ($charset == 'utf-8') {
        return $this->utf8_char_mapping($string, 'ascii');
    }

    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'ascii');
    }

    // treat everything else as single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'ascii');
}
01724
01725
/**
 * Selects the TYPO3 language key that best matches a list of preferred language codes.
 *
 * The list has the form "code[;q=quality],code[;q=quality],...". Codes are tried in order
 * of descending quality; codes of equal quality are tried in the order in which they are
 * listed. A code that is not known is tried again without its part after the first '-'.
 *
 * @param string $languageCodesList Comma separated language codes, each optionally followed by ";q=<quality>"
 * @return string TYPO3 language key, or 'default' if nothing matches or the match is 'en'
 */
public function getPreferredClientLanguage($languageCodesList) {
    $selectedLanguage = 'default';

    // get all languages where TYPO3 code is the same as the ISO code
    $allLanguageCodes = array();
    foreach (array_keys($this->charSetArray) as $typo3Lang) {
        $allLanguageCodes[$typo3Lang] = $typo3Lang;
    }

    // get all languages where TYPO3 code differs from ISO code
    // or needs the country part
    // the iso codes will here overwrite the default typo3 language in the key
    foreach ($this->isoArray as $typo3Lang => $isoLang) {
        $allLanguageCodes[$typo3Lang] = str_replace('_', '-', $isoLang);
    }

    // move the iso codes to the keys (because we're comparing the keys with "isset" later on)
    $allLanguageCodes = array_flip($allLanguageCodes);

    // collect the quality of every preferred language; a repeated code keeps its first
    // position in the list and takes the quality given last
    $qualityByLanguage = array();
    foreach (t3lib_div::trimExplode(',', $languageCodesList) as $preferredLanguage) {
        $quality = 1.0;
        if (strpos($preferredLanguage, ';q=') !== FALSE) {
            list($preferredLanguage, $quality) = explode(';q=', $preferredLanguage);
        }
        $qualityByLanguage[$preferredLanguage] = $quality;
    }

    // order the preferred languages by quality, highest first; the list position is the
    // second sort key so that languages of equal quality keep the order of the list
    $languages = array_keys($qualityByLanguage);
    if (count($languages) > 1) {
        $qualities = array_map('floatval', array_values($qualityByLanguage));
        $positions = array_keys($languages);
        array_multisort($qualities, SORT_DESC, SORT_NUMERIC, $positions, SORT_ASC, SORT_NUMERIC, $languages);
    }

    // loop through the languages, with the highest priority first
    foreach ($languages as $preferredLanguage) {
        $preferredLanguage = (string) $preferredLanguage;
        if (isset($allLanguageCodes[$preferredLanguage])) {
            $selectedLanguage = $allLanguageCodes[$preferredLanguage];
            break;
        }

        // strip the country code from the end
        $codeParts = explode('-', $preferredLanguage, 2);
        if (isset($allLanguageCodes[$codeParts[0]])) {
            $selectedLanguage = $allLanguageCodes[$codeParts[0]];
            break;
        }
    }

    if (!$selectedLanguage || $selectedLanguage == 'en') {
        $selectedLanguage = 'default';
    }
    return $selectedLanguage;
}
01786
01787
01788         /********************************************
01789          *
01790          * Internal string operation functions
01791          *
01792          ********************************************/
01793
/**
 * Maps every byte of a string in a single-byte charset through a conversion table.
 *
 * @param string $str String to convert
 * @param string $charset Charset of $str
 * @param string $mode 'case' (case folding) or 'ascii' (conversion to ASCII); any other value returns $str unchanged
 * @param string $opt For mode 'case': name of the case table inside $this->caseFolding[$charset] (initCaseFolding() builds 'toUpper', 'toLower' and 'toTitle')
 * @return string The converted string, or $str unchanged if the table could not be initialised
 */
function sb_char_mapping($str, $charset, $mode, $opt = '') {
    switch ($mode) {
        case 'case':
            if (!$this->initCaseFolding($charset)) {
                return $str;
            } // do nothing
            $map =& $this->caseFolding[$charset][$opt];
            break;

        case 'ascii':
            if (!$this->initToASCII($charset)) {
                return $str;
            } // do nothing
            $map =& $this->toASCII[$charset];
            break;

        default:
            return $str;
    }

    $str = (string) $str;
    $out = '';
    // iterate by byte length instead of probing for an offset behind the end of the string
    $byteCount = strlen($str);
    for ($i = 0; $i < $byteCount; $i++) {
        $c = $str[$i];
        // bytes without an entry in the table are copied unchanged
        $out .= isset($map[$c]) ? $map[$c] : $c;
    }

    return $out;
}
01836
01837
01838         /********************************************
01839          *
01840          * Internal UTF-8 string operation functions
01841          *
01842          ********************************************/
01843
/**
 * Returns a part of a UTF-8 string, with position and length counted in characters.
 *
 * @param string $str UTF-8 string
 * @param integer $start Start position in characters; negative values count from the end of the string
 * @param integer $len Length in characters; negative values leave out that many characters at the end; NULL means "up to the end"
 * @return string|boolean The substring, or FALSE if a positive $start lies outside the string
 */
function utf8_substr($str, $start, $len = NULL) {
    // an explicit length of zero (integer or string) always yields an empty string
    if ($len !== NULL && (string) $len === '0') {
        return '';
    }

    $byte_start = $this->utf8_char2byte_pos($str, $start);
    if ($byte_start === FALSE) {
        if ($start > 0) {
            return FALSE; // $start outside string length
        }
        // a negative $start reaching before the string begins at the first byte
        $byte_start = 0;
    }

    $str = substr($str, $byte_start);

    if ($len == NULL) {
        return $str;
    }

    $byte_end = $this->utf8_char2byte_pos($str, $len);
    if ($byte_end === FALSE) {
        // $len outside actual string length: a negative length that exceeds the
        // string leaves nothing, a positive one selects everything that is left
        return $len < 0 ? '' : $str;
    }

    return substr($str, 0, $byte_end);
}
01886
/**
 * Counts the characters of a UTF-8 string.
 *
 * Every byte that is not a continuation byte (10xxxxxx) starts a character, so
 * single bytes (0xxxxxxx) and multi-byte start bytes (11xxxxxx) are counted.
 *
 * @param string $str UTF-8 string
 * @return integer Number of characters
 */
function utf8_strlen($str) {
    $str = (string) $str;
    $n = 0;
    // iterate by byte length instead of probing for an offset behind the end of the string
    $byteCount = strlen($str);
    for ($i = 0; $i < $byteCount; $i++) {
        if ((ord($str[$i]) & 0xC0) !== 0x80) {
            $n++;
        }
    }
    return $n;
}
01911
/**
 * Truncates a UTF-8 string to at most $len bytes without cutting a character in two.
 *
 * If the byte at the cut position belongs to a multi-byte character that does not fit
 * completely, the whole character is left out.
 *
 * @param string $str UTF-8 string
 * @param integer $len Maximum length in bytes
 * @return string The truncated string; empty if $len is not positive
 */
function utf8_strtrunc($str, $len) {
    $str = (string) $str;
    $len = (int) $len;
    $byteCount = strlen($str);
    if ($len <= 0 || $byteCount === 0) {
        return '';
    }
    if ($len > $byteCount) {
        // never look behind the end of the string
        $len = $byteCount;
    }

    $i = $len - 1;
    if (ord($str[$i]) & 0x80) { // part of a multibyte sequence
        // walk back over continuation bytes (10xxxxxx) to the first byte of the character
        while ($i > 0 && (ord($str[$i]) & 0xC0) === 0x80) {
            $i--;
        }

        $startByte = ord($str[$i]);
        if (($startByte & 0xC0) !== 0xC0) {
            // malformed input: continuation bytes without a start byte are dropped
            return ($startByte & 0x80) ? '' : substr($str, 0, $i + 1);
        }

        // the number of leading 1-bits of the start byte is the length of the character
        for ($bc = 0, $mask = 0x80; $startByte & $mask; $mask >>= 1) {
            $bc++;
        }
        if ($bc + $i > $len) {
            // the character does not fit: cut in front of it
            return substr($str, 0, $i);
        }
        // fallthru: multibyte char fits into length (also when it starts at byte 0)
    }
    return substr($str, 0, $len);
}
01936
/**
 * Finds the first occurrence of a string in a UTF-8 string, counted in characters.
 *
 * mbstring or iconv are used when configured in $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'];
 * otherwise the search is done on bytes and the positions are converted.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @param integer $offset Character position at which the search starts
 * @return integer|boolean Character position of the first occurrence, or FALSE if the needle was not found or $offset lies beyond the string
 */
function utf8_strpos($haystack, $needle, $offset = 0) {
    switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) {
        case 'mbstring':
            return mb_strpos($haystack, $needle, $offset, 'utf-8');
        case 'iconv':
            return iconv_strpos($haystack, $needle, $offset, 'utf-8');
    }

    // translate the character offset into a byte offset, search on bytes,
    // then translate the byte position of the hit back into a character position
    $startByte = $this->utf8_char2byte_pos($haystack, $offset);
    if ($startByte !== FALSE) {
        $foundByte = strpos($haystack, $needle, $startByte);
        if ($foundByte !== FALSE) {
            return $this->utf8_byte2char_pos($haystack, $foundByte);
        }
    }

    // offset beyond string length, or needle not found
    return FALSE;
}
01966
/**
 * Finds the last occurrence of a string in a UTF-8 string, counted in characters.
 *
 * mbstring or iconv are used when configured in $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'];
 * otherwise the search is done on bytes and the position is converted.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @return integer|boolean Character position of the last occurrence, or FALSE if the needle was not found
 */
function utf8_strrpos($haystack, $needle) {
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        // the third argument of mb_strrpos() is the offset, the encoding is the fourth
        return mb_strrpos($haystack, $needle, 0, 'utf-8');
    } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
        return iconv_strrpos($haystack, $needle, 'utf-8');
    }

    $byte_pos = strrpos($haystack, $needle);
    if ($byte_pos === FALSE) {
        return FALSE;
    } // needle not found

    return $this->utf8_byte2char_pos($haystack, $byte_pos);
}
01990
/**
 * Translates a character position in a UTF-8 string into a byte position.
 *
 * For a negative $pos the characters are counted backwards from the end of the
 * string and the byte position of the first counted character is returned.
 *
 * @param string $str UTF-8 string
 * @param integer $pos Character position; negative values count from the end of the string
 * @return integer|boolean Byte position, or FALSE if the scan reaches the start or the end of the string first
 */
function utf8_char2byte_pos($str, $pos) {
    $str = (string) $str;
    $byteCount = strlen($str);
    $n = 0; // number of characters found
    $p = abs($pos); // number of characters wanted

    if ($pos >= 0) {
        $i = 0;
        $d = 1;
    } else {
        $i = $byteCount - 1;
        $d = -1;
    }

    // Explicit bounds instead of probing offsets: an offset of -1 would address the
    // last byte on PHP 7.1 and later, so running off the start would go unnoticed.
    for (; $i >= 0 && $i < $byteCount && $n < $p; $i += $d) {
        // single bytes (0xxxxxxx) and multi-byte start bytes (11xxxxxx) start a character
        if ((ord($str[$i]) & 0xC0) !== 0x80) {
            $n++;
        }
    }
    if ($i < 0 || $i >= $byteCount) {
        return FALSE;
    } // offset beyond string length

    if ($pos >= 0) {
        // skip trailing multi-byte data bytes
        while ($i < $byteCount && (ord($str[$i]) & 0xC0) === 0x80) {
            $i++;
        }
    } else {
        // correct offset: the loop stopped one byte in front of the character
        $i++;
    }

    return $i;
}
02039
/**
 * Translates a byte position in a UTF-8 string into a character position.
 *
 * The result is the number of character starts in the bytes 1 to $pos, which is the
 * character position if $pos is itself the start of a character. The position directly
 * behind the last byte is accepted and counts like a character start.
 *
 * @param string $str UTF-8 string
 * @param integer $pos Byte position
 * @return integer|boolean Character position, or FALSE if the string is empty or $pos lies outside the string
 */
function utf8_byte2char_pos($str, $pos) {
    $str = (string) $str;
    $pos = (int) $pos;
    $byteCount = strlen($str);
    if ($byteCount === 0 || $pos < 0 || $pos > $byteCount) {
        return FALSE;
    } // offset beyond string length

    $n = 0; // number of characters
    for ($i = $pos; $i > 0; $i--) {
        // single bytes (0xxxxxxx) and multi-byte start bytes (11xxxxxx) start a character
        if ($i === $byteCount || (ord($str[$i]) & 0xC0) !== 0x80) {
            $n++;
        }
    }

    return $n;
}
02068
/**
 * Maps every character of a UTF-8 string through a conversion table.
 *
 * @param string $str UTF-8 string
 * @param string $mode 'case' (case folding) or 'ascii' (conversion to ASCII); any other value returns $str unchanged
 * @param string $opt For mode 'case': name of the case table inside $this->caseFolding['utf-8']
 * @return string The converted string, or $str unchanged if the Unicode data could not be initialised
 */
function utf8_char_mapping($str, $mode, $opt = '') {
    if (!$this->initUnicodeData($mode)) {
        return $str;
    } // do nothing

    switch ($mode) {
        case 'case':
            $map =& $this->caseFolding['utf-8'][$opt];
            break;

        case 'ascii':
            $map =& $this->toASCII['utf-8'];
            break;

        default:
            return $str;
    }

    $str = (string) $str;
    $out = '';
    // iterate by byte length instead of probing for an offset behind the end of the string
    $byteCount = strlen($str);
    for ($i = 0; $i < $byteCount; $i++) {
        $c = ord($str[$i]);
        if (($c & 0xC0) == 0xC0) { // multi-byte starting byte (11xxxxxx)
            // the number of leading 1-bits is the number of bytes of the character
            for ($bc = 0; $c & 0x80; $c = $c << 1) {
                $bc++;
            }
            $mbc = substr($str, $i, $bc);
            $i += $bc - 1;
        } else {
            // single-byte (0xxxxxxx), or a stray continuation byte in malformed input,
            // which is passed through as it is instead of repeating the previous character
            $mbc = $str[$i];
        }

        // characters without an entry in the table are copied unchanged
        $out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
    }

    return $out;
}
02120
02121
02122         /********************************************
02123          *
02124          * Internal EUC string operation functions
02125          *
02126          * Extended Unix Code:
02127          *  ASCII compatible 7bit single bytes chars
02128          *  8bit two byte chars
02129          *
02130          * Shift-JIS is treated as a special case.
02131          *
02132          ********************************************/
02133
/**
 * Truncates a string in an EUC based charset to at most $len bytes without cutting
 * a double-byte character in two.
 *
 * Bytes of 0x80 and above start a double-byte character. For 'shift_jis' only the
 * bytes 0x80-0x9F and 0xE0 and above do; the bytes in between are single-byte characters.
 *
 * @param string $str String in the given charset
 * @param integer $len Maximum length in bytes
 * @param string $charset Charset of $str
 * @return string The truncated string; empty if $len is not positive
 */
function euc_strtrunc($str, $len, $charset) {
    $str = (string) $str;
    if ($len <= 0) {
        return '';
    }
    if ($len >= strlen($str)) {
        return $str;
    } // string not longer than supplied length

    $sjis = ($charset == 'shift_jis');

    // Walk over whole characters until the limit is reached or passed; because the
    // string is longer than $len, every inspected position is inside the string.
    $i = 0;
    while ($i < $len) {
        $c = ord($str[$i]);
        if ($sjis) {
            $isDoubleByte = ($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0);
        } else {
            $isDoubleByte = ($c >= 0x80);
        }
        $i += $isDoubleByte ? 2 : 1;
    }

    if ($i > $len) {
        // the last character straddles the limit: cut in front of its first byte
        return substr($str, 0, $len - 1);
    }
    return substr($str, 0, $len);
}
02169
/**
 * Returns a part of a string in an EUC based charset, with position and length
 * counted in characters.
 *
 * Zero lengths, negative lengths and negative start positions are handled in the
 * same way as in utf8_substr().
 *
 * @param string $str String in the given charset
 * @param integer $start Start position in characters
 * @param string $charset Charset of $str
 * @param integer $len Length in characters; NULL means "up to the end"
 * @return string|boolean The substring, or FALSE if a positive $start lies outside the string
 */
function euc_substr($str, $start, $charset, $len = NULL) {
    // an explicit length of zero (integer or string) always yields an empty string
    if ($len !== NULL && (string) $len === '0') {
        return '';
    }

    $byte_start = $this->euc_char2byte_pos($str, $start, $charset);
    if ($byte_start === FALSE) {
        if ($start > 0) {
            return FALSE; // $start outside string length
        }
        // a negative $start reaching before the string begins at the first byte
        $byte_start = 0;
    }

    $str = substr($str, $byte_start);

    if ($len == NULL) {
        return $str;
    }

    $byte_end = $this->euc_char2byte_pos($str, $len, $charset);
    if ($byte_end === FALSE) {
        // $len outside actual string length: a negative length that exceeds the
        // string leaves nothing, a positive one selects everything that is left
        return $len < 0 ? '' : $str;
    }

    return substr($str, 0, $byte_end);
}
02203
02213         function euc_strlen($str, $charset) {
02214                 $sjis = ($charset == 'shift_jis');
02215                 $n = 0;
02216                 for ($i = 0; strlen($str{$i}); $i++) {
02217                         $c = ord($str{$i});
02218                         if ($sjis) {
02219                                 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
02220                                         $i++;
02221                                 } // advance a double-byte char
02222                         }
02223                         else {
02224                                 if ($c >= 0x80) {
02225                                         $i++;
02226                                 } // advance a double-byte char
02227                         }
02228
02229                         $n++;
02230                 }
02231
02232                 return $n;
02233         }
02234
02244         function euc_char2byte_pos($str, $pos, $charset) {
02245                 $sjis = ($charset == 'shift_jis');
02246                 $n = 0; // number of characters seen
02247                 $p = abs($pos); // number of characters wanted
02248
02249                 if ($pos >= 0) {
02250                         $i = 0;
02251                         $d = 1;
02252                 } else {
02253                         $i = strlen($str) - 1;
02254                         $d = -1;
02255                 }
02256
02257                 for (; strlen($str{$i}) && $n < $p; $i += $d) {
02258                         $c = ord($str{$i});
02259                         if ($sjis) {
02260                                 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
02261                                         $i += $d;
02262                                 } // advance a double-byte char
02263                         }
02264                         else {
02265                                 if ($c >= 0x80) {
02266                                         $i += $d;
02267                                 } // advance a double-byte char
02268                         }
02269
02270                         $n++;
02271                 }
02272                 if (!strlen($str{$i})) {
02273                         return FALSE;
02274                 } // offset beyond string length
02275
02276                 if ($pos < 0) {
02277                         $i++;
02278                 } // correct offset
02279
02280                 return $i;
02281         }
02282
02293         function euc_char_mapping($str, $charset, $mode, $opt = '') {
02294                 switch ($mode) {
02295                         case 'case':
02296                                 if (!$this->initCaseFolding($charset)) {
02297                                         return $str;
02298                                 } // do nothing
02299                                 $map =& $this->caseFolding[$charset][$opt];
02300                                 break;
02301
02302                         case 'ascii':
02303                                 if (!$this->initToASCII($charset)) {
02304                                         return $str;
02305                                 } // do nothing
02306                                 $map =& $this->toASCII[$charset];
02307                                 break;
02308
02309                         default:
02310                                 return $str;
02311                 }
02312
02313                 $sjis = ($charset == 'shift_jis');
02314                 $out = '';
02315                 for ($i = 0; strlen($str{$i}); $i++) {
02316                         $mbc = $str{$i};
02317                         $c = ord($mbc);
02318
02319                         if ($sjis) {
02320                                 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) { // a double-byte char
02321                                         $mbc = substr($str, $i, 2);
02322                                         $i++;
02323                                 }
02324                         }
02325                         else {
02326                                 if ($c >= 0x80) { // a double-byte char
02327                                         $mbc = substr($str, $i, 2);
02328                                         $i++;
02329                                 }
02330                         }
02331
02332                         if (isset($map[$mbc])) {
02333                                 $out .= $map[$mbc];
02334                         } else {
02335                                 $out .= $mbc;
02336                         }
02337                 }
02338
02339                 return $out;
02340         }
02341
02342 }
02343
// When TYPO3_MODE is defined and a file is registered for 't3lib/class.t3lib_cs.php'
// under the XCLASS key of $GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE], include that file once.
if (defined('TYPO3_MODE')) {
        if (isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])) {
                include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
        }
}
02347
02348 ?>