Changeset 17749


Ignore:
Timestamp:
09/04/12 22:04:34 (6 years ago)
Author:
rvelices
Message:

merge -r17748 from trunk to branch 2.4 bug 2735: fix/improve non latin language tags

  1. non-Latin tags (Greek/Cyrillic...) are not sorted case-insensitively, and the group-by-letter view in the tag list is not case-insensitive
  2. quick search on tag names does not correctly perform accent folding (e.g. Köln and Koln do not match) or case-insensitive matching for non-Latin letters
  3. characters used in the Romanian language (Latin Extended-B) are missing from remove_accents: Ș c8 98 = LATIN CAPITAL LETTER S WITH COMMA BELOW; ș c8 99 = LATIN SMALL LETTER S WITH COMMA BELOW; Ț c8 9a = LATIN CAPITAL LETTER T WITH COMMA BELOW; ț c8 9b = LATIN SMALL LETTER T WITH COMMA BELOW
  4. str2url allows non-Latin letters in its output only if the input does not contain any valid Latin letter/digit. We should always allow non-Latin letters in the output
Location:
branches/2.4
Files:
5 edited

Legend:

Unmodified
Added
Removed
  • branches/2.4/include/functions.inc.php

    r15598 r17749  
    204204 * @param string Str 
    205205 */ 
    206 function seems_utf8($Str) { # by bmorel at ssi dot fr 
     206function seems_utf8($Str) { 
     207  // OBSOLETE !!! 
     208  return qualify_utf8($Str) >= 0; 
     209} 
     210 
     211/* returns 0 if $str is Ascii, 1 if utf-8, -1 otherwise */ 
     212function qualify_utf8($Str) 
     213{ 
     214  $ret = 0; 
    207215  for ($i=0; $i<strlen($Str); $i++) { 
    208216    if (ord($Str[$i]) < 0x80) continue; # 0bbbbbbb 
    209     elseif ((ord($Str[$i]) & 0xE0) == 0xC0) $n=1; # 110bbbbb 
     217    $ret = 1; 
     218    if ((ord($Str[$i]) & 0xE0) == 0xC0) $n=1; # 110bbbbb 
    210219    elseif ((ord($Str[$i]) & 0xF0) == 0xE0) $n=2; # 1110bbbb 
    211220    elseif ((ord($Str[$i]) & 0xF8) == 0xF0) $n=3; # 11110bbb 
    212221    elseif ((ord($Str[$i]) & 0xFC) == 0xF8) $n=4; # 111110bb 
    213222    elseif ((ord($Str[$i]) & 0xFE) == 0xFC) $n=5; # 1111110b 
    214     else return false; # Does not match any model 
     223    else return -1; # Does not match any model 
    215224    for ($j=0; $j<$n; $j++) { # n bytes matching 10bbbbbb follow ? 
    216225      if ((++$i == strlen($Str)) || ((ord($Str[$i]) & 0xC0) != 0x80)) 
    217       return false; 
    218     } 
    219   } 
    220   return true; 
     226        return -1; 
     227    } 
     228  } 
     229  return $ret; 
    221230} 
    222231 
     
    226235function remove_accents($string) 
    227236{ 
    228   if ( !preg_match('/[\x80-\xff]/', $string) ) 
    229     return $string; 
    230  
    231   if (seems_utf8($string)) { 
     237  $utf = qualify_utf8($string); 
     238  if ( $utf == 0 ) 
     239    return $string; // ascii 
     240 
     241  if ( $utf > 0 ) { 
    232242    $chars = array( 
    233243    // Decompositions for Latin-1 Supplement 
     
    324334    "\xc5\xbc"=>'z', "\xc5\xbd"=>'Z', 
    325335    "\xc5\xbe"=>'z', "\xc5\xbf"=>'s', 
     336    // Decompositions for Latin Extended-B 
     337    "\xc8\x98"=>'S', "\xc8\x99"=>'s', 
     338    "\xc8\x9a"=>'T', "\xc8\x9b"=>'t', 
    326339    // Euro Sign 
    327340    "\xe2\x82\xac"=>'E', 
     
    354367} 
    355368 
     369if (function_exists('mb_strtolower') && defined('PWG_CHARSET')) 
     370{ 
     371  function transliterate($term) 
     372  { 
     373    return remove_accents( mb_strtolower($term, PWG_CHARSET) ); 
     374  } 
     375} 
     376else 
     377{ 
     378  function transliterate($term) 
     379  { 
     380    return remove_accents( strtolower($term) ); 
     381  } 
     382} 
     383 
     384 
     385 
    356386/** 
    357387 * simplify a string to insert it into an URL 
     
    362392function str2url($str) 
    363393{ 
    364   $raw = $str; 
    365  
    366   $str = remove_accents($str); 
    367   $str = preg_replace('/[^a-z0-9_\s\'\:\/\[\],-]/','',strtolower($str)); 
     394  $str = $safe = transliterate($str); 
     395  $str = preg_replace('/[^\x80-\xffa-z0-9_\s\'\:\/\[\],-]/','',$str); 
    368396  $str = preg_replace('/[\s\'\:\/\[\],-]+/',' ',trim($str)); 
    369397  $res = str_replace(' ','_',$str); 
     
    371399  if (empty($res)) 
    372400  { 
    373     $res = str_replace(' ','_', $raw); 
     401    $res = str_replace(' ','_', $safe); 
    374402  } 
    375403 
  • branches/2.4/include/functions_html.inc.php

    r15384 r17749  
    301301    if (!isset($cache[__FUNCTION__][ $tag['name'] ])) 
    302302    { 
    303       $cache[__FUNCTION__][ $tag['name'] ] = strtolower(str2url($tag['name'])); 
     303      $cache[__FUNCTION__][ $tag['name'] ] = transliterate($tag['name']); 
    304304    } 
    305305  } 
  • branches/2.4/include/functions_metadata.inc.php

    r12922 r17749  
    9191    // how to detect it so a plugin should do the trick. 
    9292    $value = trigger_event('clean_iptc_value', $value); 
    93     $is_utf8 = seems_utf8($value); 
    94     $value = convert_charset( $value, 
    95       $is_utf8 ? 'utf-8' : 'iso-8859-1', 
    96       get_pwg_charset() ); 
     93    if ( ($qual = qualify_utf8($value)) != 0) 
     94    {// has non ascii chars 
     95      $value = convert_charset( $value, 
     96        $qual>0 ? 'utf-8' : 'iso-8859-1', 
     97        get_pwg_charset() ); 
     98    } 
    9799  } 
    98100  return $value; 
  • branches/2.4/include/functions_search.inc.php

    r12922 r17749  
    267267 
    268268 
    269 if (function_exists('mb_strtolower')) 
    270 { 
    271   function transliterate($term) 
    272   { 
    273     return remove_accents( mb_strtolower($term) ); 
    274   } 
    275 } 
    276 else 
    277 { 
    278   function transliterate($term) 
    279   { 
    280     return remove_accents( strtolower($term) ); 
    281   } 
    282 } 
    283  
    284269function is_word_char($ch) 
    285270{ 
  • branches/2.4/tags.php

    r15578 r17749  
    100100  foreach ($tags as $tag) 
    101101  { 
    102     $tag_letter = strtoupper(mb_substr(str2url($tag['name']), 0, 1, 'utf-8')); 
     102    $tag_letter = mb_strtoupper(mb_substr(transliterate($tag['name']), 0, 1, PWG_CHARSET), PWG_CHARSET); 
    103103 
    104104    if ($current_tag_idx==0) { 
Note: See TracChangeset for help on using the changeset viewer.