Ignore:
Timestamp:
09/04/12 22:03:33 (7 years ago)
Author:
rvelices
Message:

bug 2735: fix/improve non latin language tags

  1. non-Latin tags (Greek/Cyrillic...) are not sorted case-insensitively, and the group-by-letter view in the tag list is not case-insensitive
  2. quick searching of tag names does not correctly perform accent folding (e.g. Köln and Koln do not match) or case-insensitive matching for non-Latin letters
  3. characters of the Romanian language (Latin Extended-B) are missing from remove_accents: Ș c8 98 = LATIN CAPITAL LETTER S WITH COMMA BELOW; ș c8 99 = LATIN SMALL LETTER S WITH COMMA BELOW; Ț c8 9a = LATIN CAPITAL LETTER T WITH COMMA BELOW; ț c8 9b = LATIN SMALL LETTER T WITH COMMA BELOW
  4. str2url allows non-Latin letters in its output only if the input does not contain any valid Latin letter/digit. We should always allow non-Latin letters in the output
File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/include/functions.inc.php

    r17649 r17748  
    204204 * @param string Str 
    205205 */ 
    206 function seems_utf8($Str) { # by bmorel at ssi dot fr 
     206function seems_utf8($Str) { 
     207  // OBSOLETE !!! 
     208  return qualify_utf8($Str) >= 0; 
     209} 
     210 
     211/* returns 0 if $str is Ascii, 1 if utf-8, -1 otherwise */ 
     212function qualify_utf8($Str) 
     213{ 
     214  $ret = 0; 
    207215  for ($i=0; $i<strlen($Str); $i++) { 
    208216    if (ord($Str[$i]) < 0x80) continue; # 0bbbbbbb 
    209     elseif ((ord($Str[$i]) & 0xE0) == 0xC0) $n=1; # 110bbbbb 
     217    $ret = 1; 
     218    if ((ord($Str[$i]) & 0xE0) == 0xC0) $n=1; # 110bbbbb 
    210219    elseif ((ord($Str[$i]) & 0xF0) == 0xE0) $n=2; # 1110bbbb 
    211220    elseif ((ord($Str[$i]) & 0xF8) == 0xF0) $n=3; # 11110bbb 
    212221    elseif ((ord($Str[$i]) & 0xFC) == 0xF8) $n=4; # 111110bb 
    213222    elseif ((ord($Str[$i]) & 0xFE) == 0xFC) $n=5; # 1111110b 
    214     else return false; # Does not match any model 
     223    else return -1; # Does not match any model 
    215224    for ($j=0; $j<$n; $j++) { # n bytes matching 10bbbbbb follow ? 
    216225      if ((++$i == strlen($Str)) || ((ord($Str[$i]) & 0xC0) != 0x80)) 
    217       return false; 
    218     } 
    219   } 
    220   return true; 
     226        return -1; 
     227    } 
     228  } 
     229  return $ret; 
    221230} 
    222231 
     
    226235function remove_accents($string) 
    227236{ 
    228   if ( !preg_match('/[\x80-\xff]/', $string) ) 
    229     return $string; 
    230  
    231   if (seems_utf8($string)) { 
     237  $utf = qualify_utf8($string); 
     238  if ( $utf == 0 ) 
     239    return $string; // ascii 
     240 
     241  if ( $utf > 0 ) { 
    232242    $chars = array( 
    233243    // Decompositions for Latin-1 Supplement 
     
    324334    "\xc5\xbc"=>'z', "\xc5\xbd"=>'Z', 
    325335    "\xc5\xbe"=>'z', "\xc5\xbf"=>'s', 
     336    // Decompositions for Latin Extended-B 
     337    "\xc8\x98"=>'S', "\xc8\x99"=>'s', 
     338    "\xc8\x9a"=>'T', "\xc8\x9b"=>'t', 
    326339    // Euro Sign 
    327340    "\xe2\x82\xac"=>'E', 
     
    354367} 
    355368 
     369if (function_exists('mb_strtolower') && defined('PWG_CHARSET')) 
     370{ 
     371  function transliterate($term) 
     372  { 
     373    return remove_accents( mb_strtolower($term, PWG_CHARSET) ); 
     374  } 
     375} 
     376else 
     377{ 
     378  function transliterate($term) 
     379  { 
     380    return remove_accents( strtolower($term) ); 
     381  } 
     382} 
     383 
     384 
     385 
    356386/** 
    357387 * simplify a string to insert it into an URL 
     
    362392function str2url($str) 
    363393{ 
    364   $raw = $str; 
    365  
    366   $str = remove_accents($str); 
    367   $str = preg_replace('/[^a-z0-9_\s\'\:\/\[\],-]/','',strtolower($str)); 
     394  $str = $safe = transliterate($str); 
     395  $str = preg_replace('/[^\x80-\xffa-z0-9_\s\'\:\/\[\],-]/','',$str); 
    368396  $str = preg_replace('/[\s\'\:\/\[\],-]+/',' ',trim($str)); 
    369397  $res = str_replace(' ','_',$str); 
     
    371399  if (empty($res)) 
    372400  { 
    373     $res = str_replace(' ','_', $raw); 
     401    $res = str_replace(' ','_', $safe); 
    374402  } 
    375403 
Note: See TracChangeset for help on using the changeset viewer.