mdbtools/src/libmdb/iconv.c
Evan Miller fb6637c503
Fix unused but set variable warning (#221)
Refactor mdb_unicode2ascii to eliminate warnings
2020-12-19 09:37:39 -05:00

261 lines
6.7 KiB
C

/* MDB Tools - A library for reading MS Access database files
* Copyright (C) 2000 Brian Bruns
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <errno.h>
#include "mdbtools.h"
#ifndef MIN
/*
 * Smaller of two values.
 * Both arguments and the whole expansion are parenthesized so that operators
 * inside an argument (e.g. MIN(x | y, z) or MIN(c ? a : b, z)) cannot regroup
 * with the comparison.  Each argument may still be evaluated twice, so do not
 * pass expressions with side effects.
 */
#define MIN(a,b) (((a) > (b)) ? (b) : (a))
#endif
/*
 * Expand a "Unicode Compressed" byte sequence into two-byte characters.
 *
 * The input starts in compressed mode.  A zero byte toggles between the two
 * modes and produces no output.  In compressed mode each input byte becomes
 * that byte followed by a zero byte; in uncompressed mode input bytes are
 * copied through two at a time.
 *
 *   src, slen  input bytes (the caller has already skipped any marker)
 *   dst, dlen  output buffer and its capacity in bytes
 *
 * Returns the number of bytes written to dst (always even, never > dlen).
 * Conversion stops early when dst cannot hold another two-byte character or
 * when a lone trailing byte is found in uncompressed mode.
 */
static size_t decompress_unicode(const char *src, size_t slen, char *dst, size_t dlen) {
	unsigned int compress = 1;
	size_t tlen = 0;

	/* Every character stores two bytes, so require room for both up front;
	 * testing only tlen < dlen would let an odd dlen overflow dst by one. */
	while (slen > 0 && dlen - tlen >= 2) {
		if (*src == 0) {
			/* mode switch marker */
			compress = (compress) ? 0 : 1;
			src++;
			slen--;
		} else if (compress) {
			dst[tlen++] = *src++;
			dst[tlen++] = 0;
			slen--;
		} else if (slen >= 2) {
			dst[tlen++] = *src++;
			dst[tlen++] = *src++;
			slen -= 2;
		} else { // Odd # of bytes
			break;
		}
	}
	return tlen;
}
#if HAVE_ICONV
/*
 * Convert len_in bytes at in_ptr through mdb->iconv_in into dest.
 *
 * At most dlen-1 bytes are stored, followed by a terminating NUL.  Input that
 * iconv cannot convert is replaced by '?' one code unit at a time (1 byte for
 * Jet3 files, 2 bytes otherwise) so one bad character does not discard the
 * rest of the string.
 *
 * Returns the number of bytes stored in dest, not counting the terminator.
 */
static size_t decompressed2ascii_with_iconv(MdbHandle *mdb, const char *in_ptr, size_t len_in, char *dest, size_t dlen) {
	char *out_ptr = dest;
	size_t len_out;
	const size_t unit = IS_JET3(mdb) ? 1 : 2;

	/* Without room for even the terminator there is nothing to do, and
	 * dlen - 1 below would wrap around. */
	if (!dlen)
		return 0;
	len_out = dlen - 1;

	while (1) {
		size_t rc = iconv(mdb->iconv_in, (ICONV_CONST char **)&in_ptr, &len_in, &out_ptr, &len_out);
		/* errno is only meaningful when iconv reported a failure */
		int err = (rc == (size_t)-1) ? errno : 0;

		/*
		 * Have seen database with odd number of bytes in UCS-2, shouldn't happen but protect against it
		 */
		if (!IS_JET3(mdb) && len_in <= 1) {
			break;
		}
		if (!len_in || err == E2BIG)
			break;
		/* A failure other than E2BIG can still leave the output full;
		 * stop instead of writing '?' past the end of dest. */
		if (!len_out)
			break;
		/* Don't bail if impossible conversion is encountered */
		in_ptr += unit;
		len_in -= unit;
		*out_ptr++ = '?';
		len_out--;
	}
	/* bytes stored = (dlen - 1) - bytes still free */
	dlen -= len_out + 1;
	dest[dlen] = '\0';
	return dlen;
}
#else
/*
 * Fallback conversion used when the library is built without iconv.
 *
 * Jet3 input is copied byte for byte, stopping at the first NUL in the input.
 * Other input is read as two-byte little-endian characters: a character whose
 * high byte is zero is stored as its low byte, anything else becomes '?'.
 *
 * At most dlen-1 bytes are stored in dest, followed by a terminating NUL.
 * Returns the number of bytes stored, not counting the terminator, so the
 * result always matches what is actually in dest even when truncated.
 */
static size_t decompressed2ascii_without_iconv(MdbHandle *mdb, const char *in_ptr, size_t len_in, char *dest, size_t dlen) {
	size_t i;

	/* dlen - 1 below would wrap around for an empty buffer */
	if (!dlen)
		return 0;

	if (IS_JET3(mdb)) {
		/* Plain bounded copy rather than snprintf("%.*s%n"): the count
		 * reported by %n is the untruncated length, and some C libraries
		 * reject %n altogether. */
		for (i = 0; i < len_in && i < dlen - 1 && in_ptr[i] != '\0'; i++)
			dest[i] = in_ptr[i];
		dest[i] = '\0';
		return i;
	}

	/* rough UCS-2LE to ISO-8859-1 conversion */
	/* wcstombs would be better; see libxls implementation for
	 * a multi-platform solution */
	for (i = 0; 2*i + 1 < len_in && i < dlen - 1; i++)
		dest[i] = (in_ptr[2*i + 1] == 0) ? in_ptr[2*i] : '?';
	dest[i] = '\0';
	return i;
}
#endif
/*
 * Converts text read from an MDB table.
 *
 *   src, slen   raw bytes from the table
 *   dest, dlen  output buffer and its size in bytes
 *
 * 'dest' receives the converted, null-terminated string.
 * Returns the length of the converted string, not including the terminator,
 * or 0 when src or dest is NULL or dlen is 0.
 */
int
mdb_unicode2ascii(MdbHandle *mdb, const char *src, size_t slen, char *dest, size_t dlen)
{
	char *expanded = NULL;
	const char *input;
	size_t input_len;
	size_t written;

	if (!src || !dest || !dlen)
		return 0;

	/* By default the source bytes are converted as they are. */
	input = src;
	input_len = slen;

	/* A non-Jet3 string that starts with 0xff 0xfe is 'Unicode Compressed':
	 * expand it into a scratch buffer first.  Expansion at most doubles the
	 * payload, so slen*2 bytes are always enough. */
	if (!IS_JET3(mdb) && slen >= 2
	 && (unsigned char)src[0] == 0xff && (unsigned char)src[1] == 0xfe) {
		expanded = (char *)g_malloc(slen*2);
		input_len = decompress_unicode(src + 2, slen - 2, expanded, slen * 2);
		input = expanded;
	}

#if HAVE_ICONV
	written = decompressed2ascii_with_iconv(mdb, input, input_len, dest, dlen);
#else
	written = decompressed2ascii_without_iconv(mdb, input, input_len, dest, dlen);
#endif

	/* g_free accepts NULL, so no guard is needed when nothing was expanded */
	g_free(expanded);
	return (int)written;
}
/*
 * Converts text that is about to be written to an MDB table.
 *
 *   src, slen   input text; when slen is 0 the length is taken with strlen
 *   dest, dlen  output buffer and its size in bytes
 *
 * For non-Jet3 files the two-byte result is additionally rewritten in the
 * 'Unicode Compressed' form (0xff 0xfe marker, zero byte as mode switch)
 * whenever that form is shorter than the uncompressed bytes.
 *
 * Returns the number of bytes placed in dest, or 0 when src or dest is NULL
 * or dlen is 0.  The output is not null-terminated.
 */
int
mdb_ascii2unicode(MdbHandle *mdb, const char *src, size_t slen, char *dest, size_t dlen)
{
	size_t len_in, len_out;
	const char *in_ptr;
	char *out_ptr;

	if (!src || !dest || !dlen)
		return 0;

	in_ptr = src;
	out_ptr = dest;
	len_in = slen ? slen : strlen(in_ptr);
	len_out = dlen;

#ifdef HAVE_ICONV
	iconv(mdb->iconv_out, (ICONV_CONST char **)&in_ptr, &len_in, &out_ptr, &len_out);
	/* len_out now holds the unused space, so this is the produced size */
	dlen -= len_out;
#else
	if (IS_JET3(mdb)) {
		/* single-byte text: bounded straight copy */
		dlen = (len_in > len_out) ? len_out : len_in;
		strncpy(out_ptr, in_ptr, dlen);
	} else {
		/* widen every byte to a two-byte character with a zero high byte */
		unsigned int k;

		slen = len_out/2;
		if (len_in <= slen)
			slen = len_in;
		dlen = slen*2;
		for (k = 0; k < slen; k++) {
			out_ptr[k*2] = in_ptr[k];
			out_ptr[k*2+1] = 0;
		}
	}
#endif

	/* Unicode Compression is only attempted for non-Jet3 output longer
	 * than four bytes. */
	if (IS_JET3(mdb) || dlen <= 4)
		return dlen;

	{
		unsigned char *packed = g_malloc(dlen);
		unsigned int wr = 0, rd = 0;
		int compressed = 1;

		packed[wr++] = 0xff;
		packed[wr++] = 0xfe;

		/* Setting wr to dlen marks the attempt as abandoned: it ends the
		 * loop and fails the "is it shorter" test below. */
		while (rd < dlen && wr < dlen) {
			int high_is_zero = (dest[rd+1] == 0);

			if (high_is_zero != compressed) {
				/* switch encoding mode */
				packed[wr++] = 0;
				compressed = !compressed;
				continue;
			}
			if (dest[rd] == 0) {
				/* this string cannot be compressed */
				wr = dlen;
				continue;
			}
			if (compressed) {
				/* encode compressed character */
				packed[wr++] = dest[rd];
				rd += 2;
				continue;
			}
			if (wr + 1 < dlen) {
				/* encode uncompressed character */
				packed[wr++] = dest[rd];
				packed[wr++] = dest[rd+1];
				rd += 2;
				continue;
			}
			/* no room left for a two-byte character */
			wr = dlen;
		}

		/* keep the compressed form only when it actually saved space */
		if (wr < dlen) {
			memcpy(dest, packed, wr);
			dlen = wr;
		}
		g_free(packed);
	}
	return dlen;
}
/*
 * Names the character set that converted text is delivered in.
 *
 * With iconv: the value of the MDBICONV environment variable, or "UTF-8"
 * when it is not set.
 * Without iconv: "ISO-8859-1" for non-Jet3 files, NULL for Jet3 files
 * (output is the same as the input, whose charset is unknown).
 */
const char*
mdb_target_charset(MdbHandle *mdb)
{
#ifdef HAVE_ICONV
	const char *from_env = getenv("MDBICONV");

	return from_env ? from_env : "UTF-8";
#else
	if (IS_JET3(mdb))
		return NULL; // same as input: unknown
	return "ISO-8859-1";
#endif
}
/*
 * Opens the two iconv descriptors stored in the handle: iconv_in converts
 * database text to the output charset, iconv_out converts the other way.
 *
 * The output charset comes from the MDBICONV environment variable and
 * defaults to "UTF-8".  Non-Jet3 files are converted from/to "UCS-2LE";
 * Jet3 files use the charset named by MDB_JET3_CHARSET, defaulting to
 * "CP1252".
 *
 * The results of iconv_open are stored unchecked; mdb_iconv_close compares
 * them against (iconv_t)-1 before closing.
 * Without HAVE_ICONV this function does nothing.
 */
void mdb_iconv_init(MdbHandle *mdb)
{
#ifdef HAVE_ICONV
	/* Looked up only in iconv builds, so builds without iconv neither call
	 * getenv needlessly nor carry a set-but-unused variable. */
	const char *iconv_code = getenv("MDBICONV");

	if (!iconv_code) {
		iconv_code = "UTF-8";
	}

	if (!IS_JET3(mdb)) {
		mdb->iconv_out = iconv_open("UCS-2LE", iconv_code);
		mdb->iconv_in = iconv_open(iconv_code, "UCS-2LE");
	} else {
		/* According to Microsoft Knowledge Base pages 289525 and */
		/* 202427, code page info is not contained in the database */
		const char *jet3_iconv_code = getenv("MDB_JET3_CHARSET");

		if (!jet3_iconv_code) {
			jet3_iconv_code = "CP1252";
		}
		mdb->iconv_out = iconv_open(jet3_iconv_code, iconv_code);
		mdb->iconv_in = iconv_open(iconv_code, jet3_iconv_code);
	}
#else
	(void)mdb;
#endif
}
/*
 * Closes the handle's two iconv descriptors, skipping any that holds the
 * (iconv_t)-1 value.  Does nothing when built without HAVE_ICONV.
 */
void mdb_iconv_close(MdbHandle *mdb)
{
#ifdef HAVE_ICONV
	const iconv_t unopened = (iconv_t)-1;

	if (mdb->iconv_out != unopened) {
		iconv_close(mdb->iconv_out);
	}
	if (mdb->iconv_in != unopened) {
		iconv_close(mdb->iconv_in);
	}
#endif
}