Postgres-style ILIKE operator (with Unicode support) (#244)

Access's `LIKE` is actually case-insensitive, but to prevent breaking existing
programs that rely on mdbtools' case-sensitive behavior, introduce a new
`ILIKE` operator to perform a case-insensitive match. Use GLib's `g_utf8_casefold`
to make the comparison UTF-8 aware. A "poor man's" version is implemented
in fakeglib, which relies on `towlower`, and won't work with multi-grapheme
case transformations (e.g. German Eszett).

Fixes #233
This commit is contained in:
Evan Miller 2021-08-04 14:45:31 -04:00 committed by GitHub
parent 1b147b8d29
commit a44a8ed8ae
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 74 additions and 20 deletions

View File

@ -50,7 +50,7 @@ SQL LANGUAGE
limit clause: LIMIT <integer>
operator: =, =>, =<, <>, like, <, >
operator: =, =>, =<, <>, like, ilike, <, >
literal: integers, floating point numbers, or string literal in single quotes
@ -63,6 +63,10 @@ NOTES
The -i command can be passed the string 'stdin' to test entering text as if using a pipe.
The 'like' operator performs a case-sensitive pattern match, with ANSI-style wildcards. An underscore in the pattern will match any single character, and a percent sign will match any run of characters.
The 'ilike' operator is similar, but performs a case-insensitive pattern match.
ENVIRONMENT
LC_COLLATE Defines the locale for string-comparison operations. See locale(1).
MDB_JET3_CHARSET Defines the charset of the input JET3 (access 97) file. Default is CP1252. See iconv(1).

View File

@ -144,6 +144,7 @@ void g_printerr(const gchar *format, ...);
gint g_unichar_to_utf8(gunichar c, gchar *dst);
gchar *g_locale_to_utf8(const gchar *opsysstring, size_t len,
size_t *bytes_read, size_t *bytes_written, GError **error);
gchar *g_utf8_casefold(const gchar *str, gssize len);
gchar *g_utf8_strdown(const gchar *str, gssize len);
/* GString */

View File

@ -129,7 +129,8 @@ enum {
MDB_LTEQ,
MDB_LIKE,
MDB_ISNULL,
MDB_NOTNULL
MDB_NOTNULL,
MDB_ILIKE,
};
typedef enum {
@ -164,6 +165,7 @@ enum {
x == MDB_GTEQ || \
x == MDB_LTEQ || \
x == MDB_LIKE || \
x == MDB_ILIKE || \
x == MDB_ISNULL || \
x == MDB_NOTNULL )
@ -611,6 +613,7 @@ void mdb_dump_stats(MdbHandle *mdb);
/* like.c */
int mdb_like_cmp(char *s, char *r);
int mdb_ilike_cmp(char *s, char *r);
/* write.c */
void mdb_put_int16(void *buf, guint32 offset, guint32 value);

View File

@ -244,11 +244,16 @@ gchar *g_locale_to_utf8(const gchar *opsysstring, size_t len,
size_t *bytes_read, size_t *bytes_written, GError **error) {
if (len == (size_t)-1)
len = strlen(opsysstring);
wchar_t *utf16 = malloc(sizeof(wchar_t)*(len+1));
if (mbstowcs(utf16, opsysstring, len+1) == (size_t)-1) {
free(utf16);
return g_strndup(opsysstring, len);
size_t wlen = mbstowcs(NULL, opsysstring, 0);
if (wlen == (size_t)-1) {
if (error) {
*error = malloc(sizeof(GError));
(*error)->message = g_strdup_printf("Invalid multibyte string: %s\n", opsysstring);
}
return NULL;
}
wchar_t *utf16 = malloc(sizeof(wchar_t)*(wlen+1));
mbstowcs(utf16, opsysstring, wlen+1);
gchar *utf8 = malloc(3*len+1);
gchar *dst = utf8;
for (size_t i=0; i<len; i++) {
@ -260,6 +265,10 @@ gchar *g_locale_to_utf8(const gchar *opsysstring, size_t len,
return utf8;
}
/* Case-folding stand-in: the result is whatever g_utf8_strdown() produces for
 * the same arguments, so "folding" here is simply lowercasing. The returned
 * pointer is passed through unchanged from g_utf8_strdown(). */
gchar *g_utf8_casefold(const gchar *str, gssize len) {
	gchar *lowered = g_utf8_strdown(str, len);
	return lowered;
}
gchar *g_utf8_strdown(const gchar *str, gssize len) {
gssize i = 0;
if (len == -1)
@ -547,11 +556,10 @@ gboolean g_option_context_parse(GOptionContext *context,
while ((c = getopt_long(*argc, *argv, short_opts, long_opts, &longindex)) != -1) {
if (c == '?') {
*error = malloc(sizeof(GError));
(*error)->message = malloc(100);
if (optopt) {
snprintf((*error)->message, 100, "Unrecognized option: -%c", optopt);
(*error)->message = g_strdup_printf("Unrecognized option: -%c", optopt);
} else {
snprintf((*error)->message, 100, "Unrecognized option: %s", (*argv)[optind-1]);
(*error)->message = g_strdup_printf("Unrecognized option: %s", (*argv)[optind-1]);
}
free(short_opts);
free(long_opts);

View File

@ -1014,7 +1014,7 @@ int mdb_index_compute_cost(MdbTableDef *table, MdbIndex *idx)
/*
* a like with a wild card first is useless as a sarg */
if (sarg->op == MDB_LIKE && sarg->value.s[0]=='%')
if ((sarg->op == MDB_LIKE || sarg->op == MDB_ILIKE) && sarg->value.s[0]=='%')
return 0;
/*
@ -1027,6 +1027,7 @@ int mdb_index_compute_cost(MdbTableDef *table, MdbIndex *idx)
case MDB_EQUAL:
return 1; break;
case MDB_LIKE:
case MDB_ILIKE:
return 4; break;
case MDB_ISNULL:
return 12; break;
@ -1040,6 +1041,7 @@ int mdb_index_compute_cost(MdbTableDef *table, MdbIndex *idx)
else return 1;
break;
case MDB_LIKE:
case MDB_ILIKE:
return 6; break;
case MDB_ISNULL:
return 12; break;
@ -1053,6 +1055,7 @@ int mdb_index_compute_cost(MdbTableDef *table, MdbIndex *idx)
case MDB_EQUAL:
return 2; break;
case MDB_LIKE:
case MDB_ILIKE:
return 5; break;
case MDB_ISNULL:
return 12; break;
@ -1066,6 +1069,7 @@ int mdb_index_compute_cost(MdbTableDef *table, MdbIndex *idx)
else return 2;
break;
case MDB_LIKE:
case MDB_ILIKE:
return 7; break;
case MDB_ISNULL:
return 12; break;

View File

@ -39,11 +39,7 @@ int mdb_like_cmp(char *s, char *r)
mdb_debug(MDB_DEBUG_LIKE, "comparing %s and %s", s, r);
switch (r[0]) {
case '\0':
if (s[0]=='\0') {
return 1;
} else {
return 0;
}
return (s[0]=='\0');
case '_':
/* skip one character */
return mdb_like_cmp(&s[1],&r[1]);
@ -71,3 +67,25 @@ int mdb_like_cmp(char *s, char *r)
}
}
}
/**
 *
 * @param s: String to test against the pattern.
 * @param r: Search pattern, matched without regard to case.
 *
 * Case-insensitive counterpart of mdb_like_cmp(); this mimics the behavior of
 * the Access LIKE operator. Both @s and @r are case-folded with
 * g_utf8_casefold() and the folded copies are handed to mdb_like_cmp(), so the
 * wildcard rules are those of mdb_like_cmp(): a percent sign matches any
 * number of characters and an underscore matches any single character.
 * The folded copies are released with g_free() before returning; the caller's
 * strings are not modified.
 *
 * @Returns: 1 if the string matches, 0 if the string does not match.
 */
int mdb_ilike_cmp(char *s, char *r) {
	char *folded_subject = g_utf8_casefold(s, -1);
	char *folded_pattern = g_utf8_casefold(r, -1);
	int matched = mdb_like_cmp(folded_subject, folded_pattern);

	/* The folded strings are temporaries owned by this function. */
	g_free(folded_pattern);
	g_free(folded_subject);

	return matched;
}

View File

@ -47,6 +47,9 @@ int rc;
if (node->op == MDB_LIKE) {
return mdb_like_cmp(s,node->value.s);
}
if (node->op == MDB_ILIKE) {
return mdb_ilike_cmp(s,node->value.s);
}
rc = strcoll(node->value.s, s);
switch (node->op) {
case MDB_EQUAL:

View File

@ -71,6 +71,7 @@ null { return NUL; }
"<" { return LT; }
">" { return GT; }
like { return LIKE; }
ilike { return ILIKE; }
limit { return LIMIT; }
top { return TOP; }
percent { return PERCENT; }

View File

@ -324,6 +324,9 @@ mdb_sql_dump_node(MdbSargNode *node, int level)
case MDB_LIKE:
printf(" like %s\n", node->value.s);
break;
case MDB_ILIKE:
printf(" ilike %s\n", node->value.s);
break;
case MDB_EQUAL:
printf(" = %d\n", node->value.i);
break;
@ -398,6 +401,7 @@ mdb_sql_eval_expr(MdbSQL *sql, char *const1, int op, char *const2)
case MDB_LT: compar = (value < 0); break;
case MDB_LTEQ: compar = (value <= 0); break;
case MDB_LIKE: compar = mdb_like_cmp(const1,const2); break;
case MDB_ILIKE: compar = mdb_ilike_cmp(const1,const2); break;
default: illop = 1;
}
} else if (const1[0]!='\'' && const2[0]!='\'') {

View File

@ -63,7 +63,7 @@ typedef struct sql_context
%token <name> IDENT NAME PATH STRING NUMBER OPENING CLOSING
%token SELECT FROM WHERE CONNECT DISCONNECT TO LIST TABLES AND OR NOT LIMIT COUNT STRPTIME
%token DESCRIBE TABLE TOP PERCENT
%token LTEQ GTEQ LIKE IS NUL
%token LTEQ GTEQ LIKE ILIKE IS NUL
%type <name> database
%type <name> constant
@ -81,7 +81,7 @@ typedef struct sql_context
%left OR
%left AND
%right NOT
%left EQ LTEQ GTEQ LT GT LIKE IS
%left EQ LTEQ GTEQ LT GT LIKE ILIKE IS
%%
@ -193,6 +193,7 @@ operator:
| LTEQ { $$ = MDB_LTEQ; }
| GTEQ { $$ = MDB_GTEQ; }
| LIKE { $$ = MDB_LIKE; }
| ILIKE { $$ = MDB_ILIKE; }
;
nulloperator:

View File

@ -413,7 +413,10 @@ main(int argc, char **argv)
while (1) {
line ++;
if (s) free(s);
if (s) {
free(s);
s = NULL;
}
if (in) {
s=calloc(bufsz, 1);
@ -434,9 +437,13 @@ main(int argc, char **argv)
s[strlen(s)-1]=0;
} else {
snprintf(prompt, sizeof(prompt), "%d => ", line);
s=readline(prompt);
if (!s)
locale = setlocale(LC_CTYPE, "");
char *l = readline(prompt);
setlocale(LC_CTYPE, locale);
if (!l)
break;
s=g_locale_to_utf8(l, -1, NULL, NULL, NULL);
free(l);
}
if (!strcmp(s,"exit") || !strcmp(s,"quit") || !strcmp(s,"bye"))