mirror of
https://github.com/mdbtools/mdbtools.git
synced 2025-04-05 20:31:00 +08:00
Postgres-style ILIKE operator (with Unicode support) (#244)
Access's `LIKE` is actually case-insensitive, but to prevent breaking existing programs that rely on mdbtools' case-sensitive behavior, introduce a new `ILIKE` operator to perform a case-insensitive match. Use GLib's `g_utf8_casefold` to make the comparison UTF-8 aware. A "poor man's" version is implemented in fakeglib, which relies on `towlower`, and won't work with multi-grapheme case transformations (e.g. German Eszett). Fixes #233
This commit is contained in:
parent
1b147b8d29
commit
a44a8ed8ae
@ -50,7 +50,7 @@ SQL LANGUAGE
|
||||
|
||||
limit clause: LIMIT <integer>
|
||||
|
||||
operator: =, =>, =<, <>, like, <, >
|
||||
operator: =, =>, =<, <>, like, ilike, <, >
|
||||
|
||||
literal: integers, floating point numbers, or string literal in single quotes
|
||||
|
||||
@ -63,6 +63,10 @@ NOTES
|
||||
|
||||
The -i option can be passed the string 'stdin' to test entering text as if using a pipe.
|
||||
|
||||
The 'like' operator performs a case-sensitive pattern match, with ANSI-style wildcards. An underscore in the pattern will match any single character, and a percent sign will match any run of characters.
|
||||
|
||||
The 'ilike' operator is similar, but performs a case-insensitive pattern match.
|
||||
|
||||
ENVIRONMENT
|
||||
LC_COLLATE Defines the locale for string-comparison operations. See locale(1).
|
||||
MDB_JET3_CHARSET Defines the charset of the input JET3 (Access 97) file. Default is CP1252. See iconv(1).
|
||||
|
@ -144,6 +144,7 @@ void g_printerr(const gchar *format, ...);
|
||||
gint g_unichar_to_utf8(gunichar c, gchar *dst);
|
||||
gchar *g_locale_to_utf8(const gchar *opsysstring, size_t len,
|
||||
size_t *bytes_read, size_t *bytes_written, GError **error);
|
||||
gchar *g_utf8_casefold(const gchar *str, gssize len);
|
||||
gchar *g_utf8_strdown(const gchar *str, gssize len);
|
||||
|
||||
/* GString */
|
||||
|
@ -129,7 +129,8 @@ enum {
|
||||
MDB_LTEQ,
|
||||
MDB_LIKE,
|
||||
MDB_ISNULL,
|
||||
MDB_NOTNULL
|
||||
MDB_NOTNULL,
|
||||
MDB_ILIKE,
|
||||
};
|
||||
|
||||
typedef enum {
|
||||
@ -164,6 +165,7 @@ enum {
|
||||
x == MDB_GTEQ || \
|
||||
x == MDB_LTEQ || \
|
||||
x == MDB_LIKE || \
|
||||
x == MDB_ILIKE || \
|
||||
x == MDB_ISNULL || \
|
||||
x == MDB_NOTNULL )
|
||||
|
||||
@ -611,6 +613,7 @@ void mdb_dump_stats(MdbHandle *mdb);
|
||||
|
||||
/* like.c */
|
||||
int mdb_like_cmp(char *s, char *r);
|
||||
int mdb_ilike_cmp(char *s, char *r);
|
||||
|
||||
/* write.c */
|
||||
void mdb_put_int16(void *buf, guint32 offset, guint32 value);
|
||||
|
@ -244,11 +244,16 @@ gchar *g_locale_to_utf8(const gchar *opsysstring, size_t len,
|
||||
size_t *bytes_read, size_t *bytes_written, GError **error) {
|
||||
if (len == (size_t)-1)
|
||||
len = strlen(opsysstring);
|
||||
wchar_t *utf16 = malloc(sizeof(wchar_t)*(len+1));
|
||||
if (mbstowcs(utf16, opsysstring, len+1) == (size_t)-1) {
|
||||
free(utf16);
|
||||
return g_strndup(opsysstring, len);
|
||||
size_t wlen = mbstowcs(NULL, opsysstring, 0);
|
||||
if (wlen == (size_t)-1) {
|
||||
if (error) {
|
||||
*error = malloc(sizeof(GError));
|
||||
(*error)->message = g_strdup_printf("Invalid multibyte string: %s\n", opsysstring);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
wchar_t *utf16 = malloc(sizeof(wchar_t)*(wlen+1));
|
||||
mbstowcs(utf16, opsysstring, wlen+1);
|
||||
gchar *utf8 = malloc(3*len+1);
|
||||
gchar *dst = utf8;
|
||||
for (size_t i=0; i<len; i++) {
|
||||
@ -260,6 +265,10 @@ gchar *g_locale_to_utf8(const gchar *opsysstring, size_t len,
|
||||
return utf8;
|
||||
}
|
||||
|
||||
/*
 * Case-folding stand-in: forwards str and len unchanged to g_utf8_strdown()
 * and hands back whatever that call returns.
 */
gchar *g_utf8_casefold(const gchar *str, gssize len) {
    gchar *folded = g_utf8_strdown(str, len);
    return folded;
}
|
||||
|
||||
gchar *g_utf8_strdown(const gchar *str, gssize len) {
|
||||
gssize i = 0;
|
||||
if (len == -1)
|
||||
@ -547,11 +556,10 @@ gboolean g_option_context_parse(GOptionContext *context,
|
||||
while ((c = getopt_long(*argc, *argv, short_opts, long_opts, &longindex)) != -1) {
|
||||
if (c == '?') {
|
||||
*error = malloc(sizeof(GError));
|
||||
(*error)->message = malloc(100);
|
||||
if (optopt) {
|
||||
snprintf((*error)->message, 100, "Unrecognized option: -%c", optopt);
|
||||
(*error)->message = g_strdup_printf("Unrecognized option: -%c", optopt);
|
||||
} else {
|
||||
snprintf((*error)->message, 100, "Unrecognized option: %s", (*argv)[optind-1]);
|
||||
(*error)->message = g_strdup_printf("Unrecognized option: %s", (*argv)[optind-1]);
|
||||
}
|
||||
free(short_opts);
|
||||
free(long_opts);
|
||||
|
@ -1014,7 +1014,7 @@ int mdb_index_compute_cost(MdbTableDef *table, MdbIndex *idx)
|
||||
|
||||
/*
|
||||
* a like with a wild card first is useless as a sarg */
|
||||
if (sarg->op == MDB_LIKE && sarg->value.s[0]=='%')
|
||||
if ((sarg->op == MDB_LIKE || sarg->op == MDB_ILIKE) && sarg->value.s[0]=='%')
|
||||
return 0;
|
||||
|
||||
/*
|
||||
@ -1027,6 +1027,7 @@ int mdb_index_compute_cost(MdbTableDef *table, MdbIndex *idx)
|
||||
case MDB_EQUAL:
|
||||
return 1; break;
|
||||
case MDB_LIKE:
|
||||
case MDB_ILIKE:
|
||||
return 4; break;
|
||||
case MDB_ISNULL:
|
||||
return 12; break;
|
||||
@ -1040,6 +1041,7 @@ int mdb_index_compute_cost(MdbTableDef *table, MdbIndex *idx)
|
||||
else return 1;
|
||||
break;
|
||||
case MDB_LIKE:
|
||||
case MDB_ILIKE:
|
||||
return 6; break;
|
||||
case MDB_ISNULL:
|
||||
return 12; break;
|
||||
@ -1053,6 +1055,7 @@ int mdb_index_compute_cost(MdbTableDef *table, MdbIndex *idx)
|
||||
case MDB_EQUAL:
|
||||
return 2; break;
|
||||
case MDB_LIKE:
|
||||
case MDB_ILIKE:
|
||||
return 5; break;
|
||||
case MDB_ISNULL:
|
||||
return 12; break;
|
||||
@ -1066,6 +1069,7 @@ int mdb_index_compute_cost(MdbTableDef *table, MdbIndex *idx)
|
||||
else return 2;
|
||||
break;
|
||||
case MDB_LIKE:
|
||||
case MDB_ILIKE:
|
||||
return 7; break;
|
||||
case MDB_ISNULL:
|
||||
return 12; break;
|
||||
|
@ -39,11 +39,7 @@ int mdb_like_cmp(char *s, char *r)
|
||||
mdb_debug(MDB_DEBUG_LIKE, "comparing %s and %s", s, r);
|
||||
switch (r[0]) {
|
||||
case '\0':
|
||||
if (s[0]=='\0') {
|
||||
return 1;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
return (s[0]=='\0');
|
||||
case '_':
|
||||
/* skip one character */
|
||||
return mdb_like_cmp(&s[1],&r[1]);
|
||||
@ -71,3 +67,25 @@ int mdb_like_cmp(char *s, char *r)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 *
 * @param s: String to search within.
 * @param r: Search pattern, matched without regard to case.
 *
 * Case-insensitive counterpart of mdb_like_cmp(). Both @s and @r are run
 * through g_utf8_casefold(), the folded copies are compared with
 * mdb_like_cmp(), and the copies are released with g_free() before returning.
 * This mimics the behavior of the Access LIKE operator. In the search pattern,
 * a percent sign matches any number of characters, and an underscore matches
 * any single character.
 *
 * @Returns: 1 if the string matches, 0 if the string does not match.
 */
int mdb_ilike_cmp(char *s, char *r) {
	int matched;
	char *subject_folded = g_utf8_casefold(s, -1);
	char *pattern_folded = g_utf8_casefold(r, -1);

	/* Wildcard matching itself is delegated to the case-sensitive routine. */
	matched = mdb_like_cmp(subject_folded, pattern_folded);

	g_free(subject_folded);
	g_free(pattern_folded);

	return matched;
}
|
||||
|
||||
|
@ -47,6 +47,9 @@ int rc;
|
||||
if (node->op == MDB_LIKE) {
|
||||
return mdb_like_cmp(s,node->value.s);
|
||||
}
|
||||
if (node->op == MDB_ILIKE) {
|
||||
return mdb_ilike_cmp(s,node->value.s);
|
||||
}
|
||||
rc = strcoll(node->value.s, s);
|
||||
switch (node->op) {
|
||||
case MDB_EQUAL:
|
||||
|
@ -71,6 +71,7 @@ null { return NUL; }
|
||||
"<" { return LT; }
|
||||
">" { return GT; }
|
||||
like { return LIKE; }
|
||||
ilike { return ILIKE; }
|
||||
limit { return LIMIT; }
|
||||
top { return TOP; }
|
||||
percent { return PERCENT; }
|
||||
|
@ -324,6 +324,9 @@ mdb_sql_dump_node(MdbSargNode *node, int level)
|
||||
case MDB_LIKE:
|
||||
printf(" like %s\n", node->value.s);
|
||||
break;
|
||||
case MDB_ILIKE:
|
||||
printf(" ilike %s\n", node->value.s);
|
||||
break;
|
||||
case MDB_EQUAL:
|
||||
printf(" = %d\n", node->value.i);
|
||||
break;
|
||||
@ -398,6 +401,7 @@ mdb_sql_eval_expr(MdbSQL *sql, char *const1, int op, char *const2)
|
||||
case MDB_LT: compar = (value < 0); break;
|
||||
case MDB_LTEQ: compar = (value <= 0); break;
|
||||
case MDB_LIKE: compar = mdb_like_cmp(const1,const2); break;
|
||||
case MDB_ILIKE: compar = mdb_ilike_cmp(const1,const2); break;
|
||||
default: illop = 1;
|
||||
}
|
||||
} else if (const1[0]!='\'' && const2[0]!='\'') {
|
||||
|
@ -63,7 +63,7 @@ typedef struct sql_context
|
||||
%token <name> IDENT NAME PATH STRING NUMBER OPENING CLOSING
|
||||
%token SELECT FROM WHERE CONNECT DISCONNECT TO LIST TABLES AND OR NOT LIMIT COUNT STRPTIME
|
||||
%token DESCRIBE TABLE TOP PERCENT
|
||||
%token LTEQ GTEQ LIKE IS NUL
|
||||
%token LTEQ GTEQ LIKE ILIKE IS NUL
|
||||
|
||||
%type <name> database
|
||||
%type <name> constant
|
||||
@ -81,7 +81,7 @@ typedef struct sql_context
|
||||
%left OR
|
||||
%left AND
|
||||
%right NOT
|
||||
%left EQ LTEQ GTEQ LT GT LIKE IS
|
||||
%left EQ LTEQ GTEQ LT GT LIKE ILIKE IS
|
||||
|
||||
%%
|
||||
|
||||
@ -193,6 +193,7 @@ operator:
|
||||
| LTEQ { $$ = MDB_LTEQ; }
|
||||
| GTEQ { $$ = MDB_GTEQ; }
|
||||
| LIKE { $$ = MDB_LIKE; }
|
||||
| ILIKE { $$ = MDB_ILIKE; }
|
||||
;
|
||||
|
||||
nulloperator:
|
||||
|
@ -413,7 +413,10 @@ main(int argc, char **argv)
|
||||
|
||||
while (1) {
|
||||
line ++;
|
||||
if (s) free(s);
|
||||
if (s) {
|
||||
free(s);
|
||||
s = NULL;
|
||||
}
|
||||
|
||||
if (in) {
|
||||
s=calloc(bufsz, 1);
|
||||
@ -434,9 +437,13 @@ main(int argc, char **argv)
|
||||
s[strlen(s)-1]=0;
|
||||
} else {
|
||||
snprintf(prompt, sizeof(prompt), "%d => ", line);
|
||||
s=readline(prompt);
|
||||
if (!s)
|
||||
locale = setlocale(LC_CTYPE, "");
|
||||
char *l = readline(prompt);
|
||||
setlocale(LC_CTYPE, locale);
|
||||
if (!l)
|
||||
break;
|
||||
s=g_locale_to_utf8(l, -1, NULL, NULL, NULL);
|
||||
free(l);
|
||||
}
|
||||
|
||||
if (!strcmp(s,"exit") || !strcmp(s,"quit") || !strcmp(s,"bye"))
|
||||
|
Loading…
Reference in New Issue
Block a user