Changeset 17749


Ignore:
Timestamp:
Sep 4, 2012, 10:04:34 PM (8 years ago)
Author:
rvelices
Message:

merge -r17748 from trunk to branch 2.4 bug 2735: fix/improve non latin language tags

  1. non-Latin tags (Greek/Cyrillic...) are not sorted case-insensitively, and the group-by-letter view in the tag list is not case-insensitive
  2. quick searching of tag names does not correctly perform accent folding (e.g. Köln and Koln do not match) or case-insensitive matching for non-Latin letters
  3. characters of the Romanian language (Latin Extended-B) are missing from remove_accents: Ș c8 98 = LATIN CAPITAL LETTER S WITH COMMA BELOW; ș c8 99 = LATIN SMALL LETTER S WITH COMMA BELOW; Ț c8 9a = LATIN CAPITAL LETTER T WITH COMMA BELOW; ț c8 9b = LATIN SMALL LETTER T WITH COMMA BELOW
  4. str2url allows non-Latin letters in the output only if the input does not contain any valid Latin letter/digit. We should always allow non-Latin letters in the output
Location:
branches/2.4
Files:
5 edited

Legend:

Unmodified
Added
Removed
  • branches/2.4/include/functions.inc.php

    r15598 r17749  
    204204 * @param string Str
    205205 */
    206 function seems_utf8($Str) { # by bmorel at ssi dot fr
     206function seems_utf8($Str) {
     207  // OBSOLETE !!!
     208  return qualify_utf8($Str) >= 0;
     209}
     210
     211/* returns 0 if $str is Ascii, 1 if utf-8, -1 otherwise */
     212function qualify_utf8($Str)
     213{
     214  $ret = 0;
    207215  for ($i=0; $i<strlen($Str); $i++) {
    208216    if (ord($Str[$i]) < 0x80) continue; # 0bbbbbbb
    209     elseif ((ord($Str[$i]) & 0xE0) == 0xC0) $n=1; # 110bbbbb
     217    $ret = 1;
     218    if ((ord($Str[$i]) & 0xE0) == 0xC0) $n=1; # 110bbbbb
    210219    elseif ((ord($Str[$i]) & 0xF0) == 0xE0) $n=2; # 1110bbbb
    211220    elseif ((ord($Str[$i]) & 0xF8) == 0xF0) $n=3; # 11110bbb
    212221    elseif ((ord($Str[$i]) & 0xFC) == 0xF8) $n=4; # 111110bb
    213222    elseif ((ord($Str[$i]) & 0xFE) == 0xFC) $n=5; # 1111110b
    214     else return false; # Does not match any model
     223    else return -1; # Does not match any model
    215224    for ($j=0; $j<$n; $j++) { # n bytes matching 10bbbbbb follow ?
    216225      if ((++$i == strlen($Str)) || ((ord($Str[$i]) & 0xC0) != 0x80))
    217       return false;
    218     }
    219   }
    220   return true;
     226        return -1;
     227    }
     228  }
     229  return $ret;
    221230}
    222231
     
    226235function remove_accents($string)
    227236{
    228   if ( !preg_match('/[\x80-\xff]/', $string) )
    229     return $string;
    230 
    231   if (seems_utf8($string)) {
     237  $utf = qualify_utf8($string);
     238  if ( $utf == 0 )
     239    return $string; // ascii
     240
     241  if ( $utf > 0 ) {
    232242    $chars = array(
    233243    // Decompositions for Latin-1 Supplement
     
    324334    "\xc5\xbc"=>'z', "\xc5\xbd"=>'Z',
    325335    "\xc5\xbe"=>'z', "\xc5\xbf"=>'s',
     336    // Decompositions for Latin Extended-B
     337    "\xc8\x98"=>'S', "\xc8\x99"=>'s',
     338    "\xc8\x9a"=>'T', "\xc8\x9b"=>'t',
    326339    // Euro Sign
    327340    "\xe2\x82\xac"=>'E',
     
    354367}
    355368
     369if (function_exists('mb_strtolower') && defined('PWG_CHARSET'))
     370{
     371  function transliterate($term)
     372  {
     373    return remove_accents( mb_strtolower($term, PWG_CHARSET) );
     374  }
     375}
     376else
     377{
     378  function transliterate($term)
     379  {
     380    return remove_accents( strtolower($term) );
     381  }
     382}
     383
     384
     385
    356386/**
    357387 * simplify a string to insert it into an URL
     
    362392function str2url($str)
    363393{
    364   $raw = $str;
    365 
    366   $str = remove_accents($str);
    367   $str = preg_replace('/[^a-z0-9_\s\'\:\/\[\],-]/','',strtolower($str));
     394  $str = $safe = transliterate($str);
     395  $str = preg_replace('/[^\x80-\xffa-z0-9_\s\'\:\/\[\],-]/','',$str);
    368396  $str = preg_replace('/[\s\'\:\/\[\],-]+/',' ',trim($str));
    369397  $res = str_replace(' ','_',$str);
     
    371399  if (empty($res))
    372400  {
    373     $res = str_replace(' ','_', $raw);
     401    $res = str_replace(' ','_', $safe);
    374402  }
    375403
  • branches/2.4/include/functions_html.inc.php

    r15384 r17749  
    301301    if (!isset($cache[__FUNCTION__][ $tag['name'] ]))
    302302    {
    303       $cache[__FUNCTION__][ $tag['name'] ] = strtolower(str2url($tag['name']));
     303      $cache[__FUNCTION__][ $tag['name'] ] = transliterate($tag['name']);
    304304    }
    305305  }
  • branches/2.4/include/functions_metadata.inc.php

    r12922 r17749  
    9191    // how to detect it so a plugin should do the trick.
    9292    $value = trigger_event('clean_iptc_value', $value);
    93     $is_utf8 = seems_utf8($value);
    94     $value = convert_charset( $value,
    95       $is_utf8 ? 'utf-8' : 'iso-8859-1',
    96       get_pwg_charset() );
     93    if ( ($qual = qualify_utf8($value)) != 0)
     94    {// has non ascii chars
     95      $value = convert_charset( $value,
     96        $qual>0 ? 'utf-8' : 'iso-8859-1',
     97        get_pwg_charset() );
     98    }
    9799  }
    98100  return $value;
  • branches/2.4/include/functions_search.inc.php

    r12922 r17749  
    267267
    268268
    269 if (function_exists('mb_strtolower'))
    270 {
    271   function transliterate($term)
    272   {
    273     return remove_accents( mb_strtolower($term) );
    274   }
    275 }
    276 else
    277 {
    278   function transliterate($term)
    279   {
    280     return remove_accents( strtolower($term) );
    281   }
    282 }
    283 
    284269function is_word_char($ch)
    285270{
  • branches/2.4/tags.php

    r15578 r17749  
    100100  foreach ($tags as $tag)
    101101  {
    102     $tag_letter = strtoupper(mb_substr(str2url($tag['name']), 0, 1, 'utf-8'));
     102    $tag_letter = mb_strtoupper(mb_substr(transliterate($tag['name']), 0, 1, PWG_CHARSET), PWG_CHARSET);
    103103
    104104    if ($current_tag_idx==0) {
Note: See TracChangeset for help on using the changeset viewer.