Ignore:
Timestamp:
Oct 13, 2012, 5:40:14 PM (12 years ago)
Author:
rvelices
Message:

feature 2760: allow tag exclusion in quick search

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/include/functions_search.inc.php

    r18207 r18636  
    306306  {
    307307    $ch = $q[$i];
    308     if ($crt_token_modifier&QST_QUOTED==0)
     308    if ( ($crt_token_modifier&QST_QUOTED)==0)
    309309    {
    310310        if ($ch=='"')
     
    357357        $crt_token = ""; $crt_token_modifier = 0;
    358358        $state=0;
    359         break;
    360359      }
    361360      else
     
    363362    }
    364363  }
     364
    365365  if (strlen($crt_token))
    366366  {
     
    390390
    391391/**
    392  * returns the LIKE sql clause corresponding to the quick search query 
     392 * returns the LIKE sql clause corresponding to the quick search query
    393393 * that has been split into tokens
    394394 * for example file LIKE '%john%' OR file LIKE '%bill%'.
     
    410410
    411411  return count($clauses) ? '('.implode(' OR ', $clauses).')' : null;
     412}
     413
     414/**
     415*/
     416function get_qsearch_tags($tokens, $token_modifiers, &$token_tag_ids, &$not_tag_ids, &$all_tags)
     417{
     418  $token_tag_ids = array_fill(0, count($tokens), array() );
     419  $not_tag_ids = $all_tags = array();
     420
     421  $token_tag_scores = $token_tag_ids;
     422  $transliterated_tokens = array();
     423  foreach ($tokens as $token)
     424  {
     425    $transliterated_tokens[] = transliterate($token);
     426  }
     427
     428  $query = '
     429SELECT t.*, COUNT(image_id) AS counter
     430  FROM '.TAGS_TABLE.' t
     431    INNER JOIN '.IMAGE_TAG_TABLE.' ON id=tag_id
     432  GROUP BY id';
     433  $result = pwg_query($query);
     434  while ($tag = pwg_db_fetch_assoc($result))
     435  {
     436    $transliterated_tag = transliterate($tag['name']);
     437
     438    // find how this tag matches query tokens
     439    for ($i=0; $i<count($tokens); $i++)
     440    {
     441      $transliterated_token = $transliterated_tokens[$i];
     442
     443      $match = false;
     444      $pos = 0;
     445      while ( ($pos = strpos($transliterated_tag, $transliterated_token, $pos)) !== false)
     446      {
     447        if ( ($token_modifiers[$i]&QST_WILDCARD)==QST_WILDCARD )
     448        {// wildcard in this token
     449          $match = 1;
     450          break;
     451        }
     452        $token_len = strlen($transliterated_token);
     453
     454        // search begin of word
     455        $wbegin_len=0; $wbegin_char=' ';
     456        while ($pos-$wbegin_len > 0)
     457        {
     458          if (! is_word_char($transliterated_tag[$pos-$wbegin_len-1]) )
     459          {
     460            $wbegin_char = $transliterated_tag[$pos-$wbegin_len-1];
     461            break;
     462          }
     463          $wbegin_len++;
     464        }
     465
     466        // search end of word
     467        $wend_len=0; $wend_char=' ';
     468        while ($pos+$token_len+$wend_len < strlen($transliterated_tag))
     469        {
     470          if (! is_word_char($transliterated_tag[$pos+$token_len+$wend_len]) )
     471          {
     472            $wend_char = $transliterated_tag[$pos+$token_len+$wend_len];
     473            break;
     474          }
     475          $wend_len++;
     476        }
     477
     478        $this_score = 0;
     479        if ( ($token_modifiers[$i]&QST_WILDCARD)==0 )
     480        {// no wildcard begin or end
     481          if ($token_len <= 2)
     482          {// search for 1 or 2 characters must match exactly to avoid retrieving too much data
     483            if ($wbegin_len==0 && $wend_len==0 && !is_odd_wbreak_begin($wbegin_char) && !is_odd_wbreak_end($wend_char) )
     484              $this_score = 1;
     485          }
     486          elseif ($token_len == 3)
     487          {
     488            if ($wbegin_len==0)
     489              $this_score = $token_len / ($token_len + $wend_len);
     490          }
     491          else
     492          {
     493            $this_score = $token_len / ($token_len + 1.1 * $wbegin_len + 0.9 * $wend_len);
     494          }
     495        }
     496
     497        if ($this_score>0)
     498          $match = max($match, $this_score );
     499        $pos++;
     500      }
     501
     502      if ($match)
     503      {
     504        $tag_id = (int)$tag['id'];
     505        $all_tags[$tag_id] = $tag;
     506        $token_tag_ids[$i][] = $tag_id;
     507        $token_tag_scores[$i][] = $match;
     508      }
     509    }
     510  }
     511
     512  // process not tags
     513  for ($i=0; $i<count($tokens); $i++)
     514  {
     515    if ( ! ($token_modifiers[$i]&QST_NOT) )
     516      continue;
     517
     518    array_multisort($token_tag_scores[$i], SORT_DESC|SORT_NUMERIC, $token_tag_ids[$i]);
     519
     520    for ($j=0; $j<count($token_tag_scores[$i]); $j++)
     521    {
     522      if ($token_tag_scores[$i][$j] < 0.8)
     523        break;
     524      if ($j>0 && $token_tag_scores[$i][$j] < $token_tag_scores[$i][0])
     525        break;
     526      $tag_id = $token_tag_ids[$i][$j];
     527      if ( isset($all_tags[$tag_id]) )
     528      {
     529        unset($all_tags[$tag_id]);
     530        $not_tag_ids[] = $tag_id;
     531      }
     532    }
     533    $token_tag_ids[$i] = array();
     534  }
     535
     536  // process regular tags
     537  for ($i=0; $i<count($tokens); $i++)
     538  {
     539    if ( $token_modifiers[$i]&QST_NOT )
     540      continue;
     541
     542    array_multisort($token_tag_scores[$i], SORT_DESC|SORT_NUMERIC, $token_tag_ids[$i]);
     543
     544    $counter = 0;
     545    for ($j=0; $j<count($token_tag_scores[$i]); $j++)
     546    {
     547      $tag_id = $token_tag_ids[$i][$j];
     548      if ( ! isset($all_tags[$tag_id]) )
     549      {
     550        array_splice($token_tag_ids[$i], $j, 1);
     551        array_splice($token_tag_scores[$i], $j, 1);
     552      }
     553
     554      $counter += $all_tags[$tag_id]['counter'];
     555      if ($counter > 200 && $j>0 && $token_tag_scores[$i][0] > $token_tag_scores[$i][$j] )
     556      {// "many" images in previous tags and starting from this tag is less relevent
     557        array_splice($token_tag_ids[$i], $j);
     558        array_splice($token_tag_scores[$i], $j);
     559        break;
     560      }
     561    }
     562  }
     563 
     564  usort($all_tags, 'tag_alpha_compare');
     565  foreach ( $all_tags as &$tag )
     566    $tag['name'] = trigger_event('render_tag_name', $tag['name']);
    412567}
    413568
     
    445600  }
    446601  $debug[] = '<!--'.count($tokens).' tokens';
    447  
     602
    448603  $q_like_field = '@@__db_field__@@'; //something never in a search
    449604  $q_like_clause = get_qsearch_like_clause($tokens, $token_modifiers, $q_like_field );
     
    486641  }
    487642  $debug[] = count($by_weights).' fulltext';
    488   $debug[] = 'ft score min:'.min($by_weights).' max:'.max($by_weights);
    489 
    490 
    491   // Step 2 - search tags corresponding to the query $q ========================
    492   $transliterated_tokens = array();
    493   $token_tags = array();
    494   foreach ($tokens as $token)
    495   {
    496     $transliterated_tokens[] = transliterate($token);
    497     $token_tags[] = array();
    498   }
    499 
    500   // Step 2.1 - find match tags for every token in the query search
    501   $all_tags = array();
    502   $query = '
    503 SELECT id, name, url_name, COUNT(image_id) AS nb_images
    504   FROM '.TAGS_TABLE.'
    505     INNER JOIN '.IMAGE_TAG_TABLE.' ON id=tag_id
    506   GROUP BY id';
    507   $result = pwg_query($query);
    508   while ($tag = pwg_db_fetch_assoc($result))
    509   {
    510     $transliterated_tag = transliterate($tag['name']);
    511 
    512     // find how this tag matches query tokens
    513     for ($i=0; $i<count($tokens); $i++)
    514     {
    515       if ($token_modifiers[$i]&QST_NOT)
    516         continue;// ignore this NOT token
    517       $transliterated_token = $transliterated_tokens[$i];
    518 
    519       $match = false;
    520       $pos = 0;
    521       while ( ($pos = strpos($transliterated_tag, $transliterated_token, $pos)) !== false)
    522       {
    523         if ( ($token_modifiers[$i]&QST_WILDCARD)==QST_WILDCARD )
    524         {// wildcard in this token
    525           $match = 1;
    526           break;
    527         }
    528         $token_len = strlen($transliterated_token);
    529 
    530         // search begin of word
    531         $wbegin_len=0; $wbegin_char=' ';
    532         while ($pos-$wbegin_len > 0)
    533         {
    534           if (! is_word_char($transliterated_tag[$pos-$wbegin_len-1]) )
    535           {
    536             $wbegin_char = $transliterated_tag[$pos-$wbegin_len-1];
    537             break;
    538           }
    539           $wbegin_len++;
    540         }
    541 
    542         // search end of word
    543         $wend_len=0; $wend_char=' ';
    544         while ($pos+$token_len+$wend_len < strlen($transliterated_tag))
    545         {
    546           if (! is_word_char($transliterated_tag[$pos+$token_len+$wend_len]) )
    547           {
    548             $wend_char = $transliterated_tag[$pos+$token_len+$wend_len];
    549             break;
    550           }
    551           $wend_len++;
    552         }
    553 
    554         $this_score = 0;
    555         if ( ($token_modifiers[$i]&QST_WILDCARD)==0 )
    556         {// no wildcard begin or end
    557           if ($token_len <= 2)
    558           {// search for 1 or 2 characters must match exactly to avoid retrieving too much data
    559             if ($wbegin_len==0 && $wend_len==0 && !is_odd_wbreak_begin($wbegin_char) && !is_odd_wbreak_end($wend_char) )
    560               $this_score = 1;
    561           }
    562           elseif ($token_len == 3)
    563           {
    564             if ($wbegin_len==0)
    565               $this_score = $token_len / ($token_len + $wend_len);
    566           }
    567           else
    568           {
    569             $this_score = $token_len / ($token_len + 1.1 * $wbegin_len + 0.9 * $wend_len);
    570           }
    571         }
    572 
    573         if ($this_score>0)
    574           $match = max($match, $this_score );
    575         $pos++;
    576       }
    577 
    578       if ($match)
    579       {
    580         $tag_id = (int)$tag['id'];
    581         $all_tags[$tag_id] = $tag;
    582         $token_tags[$i][] = array('tag_id'=>$tag_id, 'score'=>$match);
    583       }
    584     }
    585   }
    586   $search_results['qs']['matching_tags']=$all_tags;
    587   $debug[] = count($all_tags).' tags';
    588 
    589   // Step 2.2 - reduce matching tags for every token in the query search
    590   $score_cmp_fn = create_function('$a,$b', 'return 100*($b["score"]-$a["score"]);');
    591   foreach ($token_tags as &$tt)
    592   {
    593     usort($tt, $score_cmp_fn);
    594     $nb_images = 0;
    595     $prev_score = 0;
    596     for ($j=0; $j<count($tt); $j++)
    597     {
    598       if ($nb_images > 200 && $prev_score > $tt[$j]['score'] )
    599       {// "many" images in previous tags and starting from this tag is less relevent
    600         $tt = array_slice( $tt, 0, $j);
    601         break;
    602       }
    603       $nb_images += $all_tags[ $tt[$j]['tag_id'] ]['nb_images'];
    604       $prev_score = $tt[$j]['score'];
    605     }
    606   }
    607 
    608   // Step 2.3 - get the images for tags
    609   for ($i=0; $i<count($token_tags); $i++)
    610   {
    611     $tag_ids = array();
    612     foreach($token_tags[$i] as $arr)
    613       $tag_ids[] = $arr['tag_id'];
    614     $tag_ids = array_unique($tag_ids);
     643  if (!empty($by_weights))
     644  {
     645    $debug[] = 'ft score min:'.min($by_weights).' max:'.max($by_weights);
     646  }
     647
     648
     649  // Step 2 - get the tags and the images for tags
     650  get_qsearch_tags($tokens, $token_modifiers, $token_tag_ids, $not_tag_ids, $search_results['qs']['matching_tags']);
     651  $debug[] = count($search_results['qs']['matching_tags']).' tags';
     652
     653  for ($i=0; $i<count($token_tag_ids); $i++)
     654  {
     655    $tag_ids = $token_tag_ids[$i];
    615656    $debug[] = count($tag_ids).' unique tags';
    616657
     
    619660      $tag_photo_count=0;
    620661      $query = '
    621 SELECT image_id
    622   FROM '.IMAGE_TAG_TABLE.'
     662SELECT image_id FROM '.IMAGE_TAG_TABLE.'
    623663  WHERE tag_id IN ('.implode(',',$tag_ids).')
    624664  GROUP BY image_id';
     
    630670        $tag_photo_count++;
    631671      }
    632       $debug[] = $tag_photo_count.' photos for tags';
    633       $debug[] = count($by_weights).' photos after tags';
     672      $debug[] = $tag_photo_count.' photos for tag';
     673      $debug[] = count($by_weights).' photos after';
    634674    }
    635675  }
     
    664704  }
    665705
     706  if (!empty($not_tag_ids))
     707  {
     708    $query = '
     709SELECT image_id FROM '.IMAGE_TAG_TABLE.'
     710  WHERE tag_id IN ('.implode(',',$not_tag_ids).')
     711  GROUP BY image_id';
     712      $result = pwg_query($query);
     713      while ($row = pwg_db_fetch_row($result))
     714      {
     715        $id = $row[0];
     716        unset($by_weights[$id]);
     717      }
     718      $debug[] = count($by_weights).' after not tags';
     719  }
    666720  // Step 4 - now we have $by_weights ( array image id => weight ) that need
    667721  // permission checks and/or matching categories to get images from
Note: See TracChangeset for help on using the changeset viewer.