Changeset 28144
- Timestamp:
- Apr 9, 2014, 11:23:49 PM (10 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/include/functions_search.inc.php
r28128 r28144 256 256 } 257 257 258 /**259 * Finds if a char is a letter, a figure or any char of the extended ASCII table (>127).260 *261 * @param char $ch262 * @return bool263 */264 function is_word_char($ch)265 {266 return ($ch>='0' && $ch<='9') || ($ch>='a' && $ch<='z') || ($ch>='A' && $ch<='Z') || ord($ch)>127;267 }268 269 /**270 * Finds if a char is a special token for word start: [{<=*+271 *272 * @param char $ch273 * @return bool274 */275 function is_odd_wbreak_begin($ch)276 {277 return strpos('[{<=*+', $ch)===false ? false:true;278 }279 280 /**281 * Finds if a char is a special token for word end: ]}>=*+282 *283 * @param char $ch284 * @return bool285 */286 function is_odd_wbreak_end($ch)287 {288 return strpos(']}>=*+', $ch)===false ? false:true;289 }290 258 291 259 … … 296 264 define('QST_WILDCARD_END', 0x10); 297 265 define('QST_WILDCARD', QST_WILDCARD_BEGIN|QST_WILDCARD_END); 298 266 define('QST_BREAK', 0x20); 299 267 300 268 class QSearchScope … … 553 521 if (strlen($token) || (isset($scope) && $scope->nullable)) 554 522 { 523 if (isset($scope)) 524 $modifier |= QST_BREAK; 555 525 $this->tokens[] = new QSingleToken($token, $modifier, $scope); 556 526 } … … 714 684 { 715 685 array_splice($this->tokens, $i, 1); 686 if ($i<count($this->tokens) && $this->tokens[$i]->is_single) 687 { 688 $this->tokens[$i]->modifier |= QST_BREAK; 689 } 716 690 $i--; 717 691 } 692 } 693 694 if ($level>0 && count($this->tokens) && $this->tokens[0]->is_single) 695 { 696 $this->tokens[0]->modifier |= QST_BREAK; 718 697 } 719 698 } … … 837 816 } 838 817 818 function qsearch_get_text_token_search_sql($token, $fields) 819 { 820 $clauses = array(); 821 $variants = array_merge(array($token->term), $token->variants); 822 $fts = array(); 823 foreach ($variants as $variant) 824 { 825 if (mb_strlen($variant)<=3 826 || strcspn($variant, '!"#$%&()*+,./:;<=>?@[\]^`{|}~') < 3) 827 {// odd term or too short for full text search; fallback to regex but unfortunately this is diacritic/accent sensitive 828 $pre = ($token->modifier & QST_WILDCARD_BEGIN) ? '' : '[[:<:]]'; 829 $post = ($token->modifier & QST_WILDCARD_END) ? '' : '[[:>:]]'; 830 foreach( $fields as $field) 831 $clauses[] = $field.' REGEXP \''.$pre.addslashes(preg_quote($variant)).$post.'\''; 832 } 833 else 834 { 835 $ft = $variant; 836 if ($token->modifier & QST_QUOTED) 837 $ft = '"'.$ft.'"'; 838 if ($token->modifier & QST_WILDCARD_END) 839 $ft .= '*'; 840 $fts[] = $ft; 841 } 842 } 843 844 if (count($fts)) 845 { 846 $clauses[] = 'MATCH('.implode(', ',$fields).') AGAINST( \''.addslashes(implode(' ',$fts)).'\' IN BOOLEAN MODE)'; 847 } 848 return $clauses; 849 } 850 839 851 function qsearch_get_images(QExpression $expr, QResults $qsr) 840 852 { … … 857 869 case 'photo': 858 870 $clauses[] = $file_like; 859 860 $variants = array_merge(array($token->term), $token->variants); 861 $fts = array(); 862 foreach ($variants as $variant) 863 { 864 if (mb_strlen($variant)<=3 865 || strcspn($variant, '!"#$%&()*+,./:;<=>?@[\]^`{|}~') < 3) 866 {// odd term or too short for full text search; fallback to regex but unfortunately this is diacritic/accent sensitive 867 $pre = ($token->modifier & QST_WILDCARD_BEGIN) ? '' : '[[:<:]]'; 868 $post = ($token->modifier & QST_WILDCARD_END) ? '' : '[[:>:]]'; 869 foreach( array('i.name', 'i.comment') as $field) 870 $clauses[] = $field.' REGEXP \''.$pre.addslashes(preg_quote($variant)).$post.'\''; 871 } 872 else 873 { 874 $ft = $variant; 875 if ($expr->stoken_modifiers[$i] & QST_QUOTED) 876 $ft = '"'.$ft.'"'; 877 if ($expr->stoken_modifiers[$i] & QST_WILDCARD_END) 878 $ft .= '*'; 879 $fts[] = $ft; 880 } 881 } 882 883 if (count($fts)) 884 { 885 $clauses[] = 'MATCH(i.name, i.comment) AGAINST( \''.addslashes(implode(' ',$fts)).'\' IN BOOLEAN MODE)'; 886 } 871 $clauses = array_merge($clauses, qsearch_get_text_token_search_sql($token, array('name','comment'))); 887 872 break; 888 873 … … 930 915 function qsearch_get_tags(QExpression $expr, QResults $qsr) 931 916 { 932 $tokens = $expr->stokens; 933 $token_modifiers = $expr->stoken_modifiers; 934 935 $token_tag_ids = array_fill(0, count($tokens), array() ); 917 $token_tag_ids = $qsr->tag_iids = array_fill(0, count($expr->stokens), array() ); 936 918 $all_tags = array(); 937 919 938 $token_tag_scores = $token_tag_ids; 939 $transliterated_tokens = array(); 940 foreach ($tokens as $token) 941 { 942 if (!isset($token->scope) || 'tag' == $token->scope->id) 943 { 944 $transliterated_tokens[] = transliterate($token->term); 945 } 946 else 947 { 948 $transliterated_tokens[] = ''; 949 } 950 } 951 952 $query = ' 953 SELECT t.*, COUNT(image_id) AS counter 954 FROM '.TAGS_TABLE.' t 955 INNER JOIN '.IMAGE_TAG_TABLE.' ON id=tag_id 956 GROUP BY id'; 957 $result = pwg_query($query); 958 while ($tag = pwg_db_fetch_assoc($result)) 959 { 960 $transliterated_tag = transliterate($tag['name']); 961 962 // find how this tag matches query tokens 963 for ($i=0; $i<count($tokens); $i++) 964 { 965 $transliterated_token = $transliterated_tokens[$i]; 966 if (strlen($transliterated_token)==0) 967 continue; 968 969 $match = false; 970 $pos = 0; 971 while ( ($pos = strpos($transliterated_tag, $transliterated_token, $pos)) !== false) 972 { 973 if ( ($token_modifiers[$i]&QST_WILDCARD)==QST_WILDCARD ) 974 {// wildcard in this token 975 $match = 1; 976 break; 977 } 978 $token_len = strlen($transliterated_token); 979 980 // search begin of word 981 $wbegin_len=0; $wbegin_char=' '; 982 while ($pos-$wbegin_len > 0) 983 { 984 if (! is_word_char($transliterated_tag[$pos-$wbegin_len-1]) ) 985 { 986 $wbegin_char = $transliterated_tag[$pos-$wbegin_len-1]; 987 break; 988 } 989 $wbegin_len++; 990 } 991 992 // search end of word 993 $wend_len=0; $wend_char=' '; 994 while ($pos+$token_len+$wend_len < strlen($transliterated_tag)) 995 { 996 if (! is_word_char($transliterated_tag[$pos+$token_len+$wend_len]) ) 997 { 998 $wend_char = $transliterated_tag[$pos+$token_len+$wend_len]; 999 break; 1000 } 1001 $wend_len++; 1002 } 1003 1004 $this_score = 0; 1005 if ( ($token_modifiers[$i]&QST_WILDCARD)==0 ) 1006 {// no wildcard begin or end 1007 if ($token_len <= 2) 1008 {// search for 1 or 2 characters must match exactly to avoid retrieving too much data 1009 if ($wbegin_len==0 && $wend_len==0 && !is_odd_wbreak_begin($wbegin_char) && !is_odd_wbreak_end($wend_char) ) 1010 $this_score = 1; 1011 } 1012 elseif ($token_len == 3) 1013 { 1014 if ($wbegin_len==0) 1015 $this_score = $token_len / ($token_len + $wend_len); 1016 } 1017 else 1018 { 1019 $this_score = $token_len / ($token_len + 1.1 * $wbegin_len + 0.9 * $wend_len); 1020 } 1021 } 1022 1023 if ($this_score>0) 1024 $match = max($match, $this_score ); 1025 $pos++; 1026 } 1027 1028 if ($match) 1029 { 1030 $tag_id = (int)$tag['id']; 1031 $all_tags[$tag_id] = $tag; 1032 $token_tag_ids[$i][] = $tag_id; 1033 $token_tag_scores[$i][] = $match; 1034 } 1035 } 1036 } 1037 1038 // process tags 1039 $not_tag_ids = array(); 1040 for ($i=0; $i<count($tokens); $i++) 1041 { 1042 array_multisort($token_tag_scores[$i], SORT_DESC|SORT_NUMERIC, $token_tag_ids[$i]); 1043 $is_not = $token_modifiers[$i]&QST_NOT; 1044 $counter = 0; 1045 1046 for ($j=0; $j<count($token_tag_scores[$i]); $j++) 1047 { 1048 if ($is_not) 1049 { 1050 if ($token_tag_scores[$i][$j] < 0.8 || 1051 ($j>0 && $token_tag_scores[$i][$j] < $token_tag_scores[$i][0]) ) 1052 { 1053 array_splice($token_tag_scores[$i], $j); 1054 array_splice($token_tag_ids[$i], $j); 1055 } 1056 } 1057 else 1058 { 1059 $tag_id = $token_tag_ids[$i][$j]; 1060 $counter += $all_tags[$tag_id]['counter']; 1061 if ( $j>0 && ( 1062 ($counter > 100 && $token_tag_scores[$i][0] > $token_tag_scores[$i][$j]) // "many" images in previous tags and starting from this tag is less relevant 1063 || ($token_tag_scores[$i][0]==1 && $token_tag_scores[$i][$j]<0.8) 1064 || ($token_tag_scores[$i][0]>0.8 && $token_tag_scores[$i][$j]<0.5) 1065 )) 1066 {// we remove this tag from the results, but we still leave it in all_tags list so that if we are wrong, the user chooses it 1067 array_splice($token_tag_ids[$i], $j); 1068 array_splice($token_tag_scores[$i], $j); 1069 break; 1070 } 1071 } 1072 } 1073 1074 if ($is_not) 1075 { 1076 $not_tag_ids = array_merge($not_tag_ids, $token_tag_ids[$i]); 1077 } 1078 } 1079 1080 $all_tags = array_diff_key($all_tags, array_flip($not_tag_ids)); 1081 usort($all_tags, 'tag_alpha_compare'); 1082 foreach ( $all_tags as &$tag ) 1083 { 1084 $tag['name'] = trigger_event('render_tag_name', $tag['name'], $tag); 1085 } 1086 $qsr->all_tags = $all_tags; 1087 1088 $qsr->tag_ids = $token_tag_ids; 1089 $qsr->tag_iids = array_fill(0, count($tokens), array() ); 1090 1091 for ($i=0; $i<count($tokens); $i++) 920 for ($i=0; $i<count($expr->stokens); $i++) 921 { 922 $token = $expr->stokens[$i]; 923 if (isset($token->scope) && 'tag' != $token->scope->id) 924 continue; 925 if (empty($token->term)) 926 continue; 927 928 $clauses = qsearch_get_text_token_search_sql( $token, array('name')); 929 $query = 'SELECT * FROM '.TAGS_TABLE.' 930 WHERE ('. implode("\n OR ",$clauses) .')'; 931 $result = pwg_query($query); 932 while ($tag = pwg_db_fetch_assoc($result)) 933 { 934 $token_tag_ids[$i][] = $tag['id']; 935 $all_tags[$tag['id']] = $tag; 936 } 937 } 938 939 // check adjacent short words 940 for ($i=0; $i<count($expr->stokens)-1; $i++) 941 { 942 if ( (strlen($expr->stokens[$i])<=3 || strlen($expr->stokens[$i+1])<=3) 943 && (($expr->stoken_modifiers[$i] & (QST_QUOTED|QST_WILDCARD)) == 0) 944 && (($expr->stoken_modifiers[$i+1] & (QST_BREAK|QST_QUOTED|QST_WILDCARD)) == 0) ) 945 { 946 $common = array_intersect( $token_tag_ids[$i], $token_tag_ids[$i+1] ); 947 if (count($common)) 948 { 949 $token_tag_ids[$i] = $token_tag_ids[$i+1] = $common; 950 } 951 } 952 } 953 954 // get images 955 $positive_ids = $not_ids = array(); 956 for ($i=0; $i<count($expr->stokens); $i++) 1092 957 { 1093 958 $tag_ids = $token_tag_ids[$i]; 959 $token = $expr->stokens[$i]; 1094 960 1095 961 if (!empty($tag_ids)) … … 1100 966 GROUP BY image_id'; 1101 967 $qsr->tag_iids[$i] = query2array($query, null, 'image_id'); 1102 } 1103 elseif (isset($tokens[$i]->scope) && 'tag' == $tokens[$i]->scope->id && strlen($token->term)==0) 968 if ($expr->stoken_modifiers[$i]&QST_NOT) 969 $not_ids = array_merge($not_ids, $tag_ids); 970 else 971 $positive_ids = array_merge($positive_ids, $tag_ids); 972 } 973 elseif (isset($token->scope) && 'tag' == $token->scope->id && strlen($token->term)==0) 1104 974 { 1105 975 if ($tokens[$i]->modifier & QST_WILDCARD) … … 1113 983 } 1114 984 } 1115 } 985 986 $all_tags = array_intersect_key($all_tags, array_flip( array_diff($positive_ids, $not_ids) ) ); 987 usort($all_tags, 'tag_alpha_compare'); 988 foreach ( $all_tags as &$tag ) 989 { 990 $tag['name'] = trigger_event('render_tag_name', $tag['name'], $tag); 991 } 992 $qsr->all_tags = $all_tags; 993 $qsr->tag_ids = $token_tag_ids; 994 } 995 1116 996 1117 997 … … 1260 1140 { 1261 1141 $debug[] = $expression->stokens[$i].': '.count($qsr->tag_ids[$i]).' tags, '.count($qsr->tag_iids[$i]).' tiids, '.count($qsr->images_iids[$i]).' iiids, '.count($qsr->iids[$i]).' iids' 1142 .' modifier:'.dechex($expression->stoken_modifiers[$i]) 1262 1143 .( !empty($expression->stokens[$i]->variants) ? ' variants: '.implode(', ',$expression->stokens[$i]->variants): ''); 1263 1144 }
Note: See TracChangeset
for help on using the changeset viewer.