Changeset 10340


Ignore:
Timestamp:
Apr 12, 2011, 9:46:36 PM (10 years ago)
Author:
rvelices
Message:

feature:2248 Improve quick/query search results

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/include/functions_search.inc.php

    r8728 r10340  
    266266}
    267267
     268
     269if (function_exists('mb_strtolower'))
     270{
     271  function transliterate($term)
     272  {
     273    return remove_accents( mb_strtolower($term) );
     274  }
     275}
     276else
     277{
     278  function transliterate($term)
     279  {
     280    return remove_accents( strtolower($term) );
     281  }
     282}
     283
     284function is_word_char($ch)
     285{
     286  return ($ch>='0' && $ch<='9') || ($ch>='a' && $ch<='z') || ($ch>='A' && $ch<='Z') || ord($ch)>127;
     287}
     288
    268289/**
    269  * returns the LIKE sql clause corresponding to the quick search query $q
    270  * and the field $field. example q='john bill', field='file' will return
    271  * file LIKE '%john%' OR file LIKE '%bill%'. Special characters for MySql full
    272  * text search (+,<,>,~) are omitted. The query can contain a phrase:
    273  * 'Pierre "New York"' will return LIKE '%Pierre%' OR LIKE '%New York%'.
    274  * @param string q
    275  * @param string field
    276  * @return string
     290 * analyzes and splits the quick/query search query $q into tokens
     291 * q='john bill' => 2 tokens 'john' 'bill'
     292 * Special characters for MySql full text search (+,<,>,~) appear in the token modifiers.
     293 * The query can contain a phrase: 'Pierre "New York"' will return 'pierre' and 'new york'.
    277294 */
    278 function get_qsearch_like_clause($q, $field, $before='%', $after='%')
     295function analyse_qsearch($q, &$qtokens, &$qtoken_modifiers)
    279296{
    280297  $q = stripslashes($q);
     
    293310        if ($ch=='"')
    294311        {
     312          $tokens[] = $crt_token; $token_modifiers[] = $crt_token_modifier;
     313          $crt_token = ""; $crt_token_modifier = "q";
     314          $state=1;
     315        }
     316        elseif ( $ch=='*' )
     317        { // wild card
    295318          if (strlen($crt_token))
    296319          {
    297             $tokens[] = $crt_token;
    298             $token_modifiers[] = $crt_token_modifier;
    299             $crt_token = "";
    300             $crt_token_modifier = "";
     320            $crt_token .= $ch;
    301321          }
    302           $state=1;
    303         }
    304         elseif ( $ch=='*' )
    305         { // wild card
    306           $crt_token .= '%';
     322          else
     323          {
     324            $crt_token_modifier .= '*';
     325          }
    307326        }
    308327        elseif ( strcspn($ch, '+-><~')==0 )
     
    310329          if (strlen($crt_token))
    311330          {
    312             $tokens[] = $crt_token;
    313             $token_modifiers[] = $crt_token_modifier;
    314             $crt_token = "";
    315             $crt_token_modifier = "";
     331            $tokens[] = $crt_token; $token_modifiers[] = $crt_token_modifier;
     332            $crt_token = ""; $crt_token_modifier = "";
    316333          }
    317334          $crt_token_modifier .= $ch;
     
    321338          if (strlen($crt_token))
    322339          {
    323             $tokens[] = $crt_token;
    324             $token_modifiers[] = $crt_token_modifier;
    325             $crt_token = "";
    326             $crt_token_modifier = "";
     340            $tokens[] = $crt_token; $token_modifiers[] = $crt_token_modifier;
     341            $crt_token = ""; $crt_token_modifier = "";
    327342          }
    328343        }
    329344        else
    330345        {
    331           if ( strcspn($ch, '%_')==0)
    332           {// escape LIKE specials %_
    333             $ch = '\\'.$ch;
    334           }
    335346          $crt_token .= $ch;
    336347        }
     
    340351        {
    341352          case '"':
    342             $tokens[] = $crt_token;
    343             $token_modifiers[] = $crt_token_modifier;
    344             $crt_token = "";
    345             $crt_token_modifier = "";
     353            $tokens[] = $crt_token; $token_modifiers[] = $crt_token_modifier;
     354            $crt_token = ""; $crt_token_modifier = "";
    346355            $state=0;
    347356            break;
    348357          default:
    349             if ( strcspn($ch, '%_')==0)
    350             {// escape LIKE specials %_
    351                 $ch = '\\'.$ch;
    352             }
    353358            $crt_token .= $ch;
    354359        }
     
    362367  }
    363368
     369  $qtokens = array();
     370  $qtoken_modifiers = array();
     371  for ($i=0; $i<count($tokens); $i++)
     372  {
     373    if (strstr($token_modifiers[$i], 'q')===false)
     374    {
     375      if ( substr($tokens[$i], -1)=='*' )
     376      {
     377        $tokens[$i] = rtrim($tokens[$i], '*');
     378        $token_modifiers[$i] .= '*';
     379      }
     380    }
     381    if ( strlen($tokens[$i])==0)
     382      continue;
     383    $qtokens[] = $tokens[$i];
     384    $qtoken_modifiers[] = $token_modifiers[$i];
     385  }
     386}
     387
     388
     389/**
     390 * returns the LIKE sql clause corresponding to the quick search query
     391 * that has been split into tokens
     392 * for example file LIKE '%john%' OR file LIKE '%bill%'.
     393 */
     394function get_qsearch_like_clause($tokens, $token_modifiers, $field)
     395{
    364396  $clauses = array();
    365397  for ($i=0; $i<count($tokens); $i++)
    366398  {
    367     $tokens[$i] = trim($tokens[$i], '%');
     399    $token = trim($tokens[$i], '%');
    368400    if (strstr($token_modifiers[$i], '-')!==false)
    369401      continue;
    370     if ( strlen($tokens[$i])==0)
     402    if ( strlen($token==0) )
    371403      continue;
    372     $clauses[] = $field.' LIKE \''.$before.addslashes($tokens[$i]).$after.'\'';
     404    $token = addslashes($token);
     405    $token = str_replace( array('%','_'), array('\\%','\\_'), $token); // escape LIKE specials %_
     406    $clauses[] = $field.' LIKE \'%'.$token.'%\'';
    373407  }
    374408
    375409  return count($clauses) ? '('.implode(' OR ', $clauses).')' : null;
    376410}
    377 
    378411
    379412/**
     
    396429function get_quick_search_results($q, $super_order_by, $images_where='')
    397430{
     431  global $user, $conf;
     432
    398433  $search_results =
    399434    array(
     
    406441    return $search_results;
    407442  }
     443 
     444  analyse_qsearch($q, $tokens, $token_modifiers);
     445
    408446  $q_like_field = '@@__db_field__@@'; //something never in a search
    409   $q_like_clause = get_qsearch_like_clause($q, $q_like_field );
    410 
     447  $q_like_clause = get_qsearch_like_clause($tokens, $token_modifiers, $q_like_field );
    411448
    412449  // Step 1 - first we find matches in #images table ===========================
     
    449486
    450487  // Step 2 - search tags corresponding to the query $q ========================
    451   if (!empty($q_like_clause))
    452   { // search name and url name (without accents)
    453     $query = '
    454 SELECT id, name, url_name
     488  $transliterated_tokens = array();
     489  $token_tags = array();
     490  foreach ($tokens as $token)
     491  {
     492    $transliterated_tokens[] = transliterate($token);
     493    $token_tags[] = array();
     494  }
     495
     496  // Step 2.1 - find match tags for every token in the query search
     497  $all_tags = array();
     498  $query = '
     499SELECT id, name, url_name, COUNT(image_id) AS nb_images
    455500  FROM '.TAGS_TABLE.'
    456   WHERE ('.str_replace($q_like_field, 'CONVERT(name, CHAR)', $q_like_clause).'
    457     OR '.str_replace($q_like_field, 'url_name', $q_like_clause).')';
    458     $tags = hash_from_query($query, 'id');
    459     if ( !empty($tags) )
    460     { // we got some tags; get the images
    461       $search_results['qs']['matching_tags']=$tags;
     501    INNER JOIN '.IMAGE_TAG_TABLE.' ON id=tag_id
     502  GROUP BY id';
     503  $result = pwg_query($query);
     504  while ($tag = pwg_db_fetch_assoc($result))
     505  {
     506    $transliterated_tag = transliterate($tag['name']);
     507
     508    // find how this tag matches query tokens
     509    for ($i=0; $i<count($tokens); $i++)
     510    {
     511      if (strstr($token_modifiers[$i], '-')!==false)
     512        continue;// ignore this NOT token
     513      $transliterated_token = $transliterated_tokens[$i];
     514
     515      $match = false;
     516      $pos = 0;
     517      while ( ($pos = strpos($transliterated_tag, $transliterated_token, $pos)) !== false)
     518      {
     519        if (strstr($token_modifiers[$i], '*')!==false)
     520        {// wildcard in this token
     521          $match = 1;
     522          break;
     523        }
     524        $token_len = strlen($transliterated_token);
     525
     526        $word_begin = $pos;
     527        while ($word_begin>0)
     528        {
     529          if (! is_word_char($transliterated_tag[$word_begin-1]) )
     530            break;
     531          $word_begin--;
     532        }
     533
     534        $word_end = $pos + $token_len;
     535        while ($word_end<strlen($transliterated_tag) && is_word_char($transliterated_tag[$word_end]) )
     536          $word_end++;
     537
     538        $this_score = $token_len / ($word_end-$word_begin);
     539        if ($token_len <= 2)
     540        {// search for 1 or 2 characters must match exactly to avoid retrieving too much data
     541          if ($token_len != $word_end-$word_begin)
     542            $this_score = 0;
     543        }
     544        elseif ($token_len == 3)
     545        {
     546          if ($word_end-$word_begin > 4)
     547            $this_score = 0;
     548        }
     549
     550        if ($this_score>0)
     551          $match = max($match, $this_score );
     552        $pos++;
     553      }
     554
     555      if ($match)
     556      {
     557        $tag_id = (int)$tag['id'];
     558        $all_tags[$tag_id] = $tag;
     559        $token_tags[$i][] = array('tag_id'=>$tag_id, 'score'=>$match);
     560      }
     561    }
     562  }
     563  $search_results['qs']['matching_tags']=$all_tags;
     564
     565  // Step 2.2 - reduce matching tags for every token in the query search
     566  $score_cmp_fn = create_function('$a,$b', 'return 100*($b["score"]-$a["score"]);');
     567  foreach ($token_tags as &$tt)
     568  {
     569    usort($tt, $score_cmp_fn);
     570    $nb_images = 0;
     571    $prev_score = 0;
     572    for ($j=0; $j<count($tt); $j++)
     573    {
     574      if ($nb_images > 200 && $prev_score > $tt[$j]['score'] )
     575      {// "many" images in previous tags and starting from this tag is less relevant
     576        $tt = array_slice( $tt, 0, $j);
     577        break;
     578      }
     579      $nb_images += $all_tags[ $tt[$j]['tag_id'] ]['nb_images'];
     580      $prev_score = $tt[$j]['score'];
     581    }
     582  }
     583
     584  // Step 2.3 - get the images for tags
     585  for ($i=0; $i<count($token_tags); $i++)
     586  {
     587    $tag_ids = array();
     588    foreach($token_tags[$i] as $arr)
     589      $tag_ids[] = $arr['tag_id'];
     590
     591    if (!empty($tag_ids))
     592    {
    462593      $query = '
    463 SELECT image_id, COUNT(tag_id) AS weight
     594SELECT image_id
    464595  FROM '.IMAGE_TAG_TABLE.'
    465   WHERE tag_id IN ('.implode(',',array_keys($tags)).')
     596  WHERE tag_id IN ('.implode(',',$tag_ids).')
    466597  GROUP BY image_id';
    467598      $result = pwg_query($query);
     
    469600      { // weight is important when sorting images by relevance
    470601        $image_id=(int)$row['image_id'];
    471         @$by_weights[$image_id] += $row['weight'];
     602        @$by_weights[$image_id] += 1;
    472603      }
    473604    }
    474605  }
    475606
    476 
    477607  // Step 3 - search categories corresponding to the query $q ==================
    478   global $user;
    479608  $query = '
    480609SELECT id, name, permalink, nb_images
     
    532661    );
    533662
    534   global $conf;
    535663  $query = '
    536664SELECT DISTINCT(id)
Note: See TracChangeset for help on using the changeset viewer.