[Date Prev][Date Next]
[Chronological]
[Thread]
[Top]
Add flag to UTF8normalize and pals to allow accent stripping
- To: "'OpenLDAP DEVEL'" <openldap-devel@OpenLDAP.org>
- Subject: Add flag to UTF8normalize and pals to allow accent stripping
- From: "John Hughes" <john@Calva.COM>
- Date: Mon, 25 Feb 2002 11:00:34 +0100
- Importance: Normal
Some of us are too lazy to type accents when we do searches.
Here's a patch to UTF8normalize (and its siblings UTF8bvnormalize and
UTF8normcmp) that makes them strip accents when the caller passes the
new LDAP_UTF8_STRIPACCENT flag.
This could be used to implement accent-free searches.
(On reflection, maybe the stripping should be done inside
uccanondecomp, but that's not how I wrote this patch.)
Index: include/ldap_pvt_uc.h
===================================================================
RCS file: /repo/OpenLDAP/pkg/ldap/include/ldap_pvt_uc.h,v
retrieving revision 1.16
diff -u -r1.16 ldap_pvt_uc.h
--- include/ldap_pvt_uc.h 2002/02/14 15:01:48 1.16
+++ include/ldap_pvt_uc.h 2002/02/24 12:31:22
@@ -137,6 +137,7 @@
ldap_unicode_t *,
ber_len_t );
+#define LDAP_UTF8_STRIPACCENT 0x2U
#define LDAP_UTF8_CASEFOLD 0x1U
#define LDAP_UTF8_NOCASEFOLD 0x0U
Index: libraries/liblunicode/ucstr.c
===================================================================
RCS file: /repo/OpenLDAP/pkg/ldap/libraries/liblunicode/ucstr.c,v
retrieving revision 1.15
diff -u -r1.15 ucstr.c
--- libraries/liblunicode/ucstr.c 2002/02/14 13:03:27 1.15
+++ libraries/liblunicode/ucstr.c 2002/02/24 12:31:23
@@ -6,6 +6,8 @@
#include "portable.h"
+#include <stdio.h>
+
#include <ac/ctype.h>
#include <ac/string.h>
#include <ac/stdlib.h>
@@ -95,12 +97,14 @@
char * UTF8normalize(
struct berval *bv,
- unsigned casefold )
+ unsigned flags )
{
int i, j, len, clen, outpos, ucsoutlen, outsize, last;
char *out, *s;
unsigned long *ucs, *p, *ucsout;
-
+ int casefold = flags & LDAP_UTF8_CASEFOLD;
+ int strip = flags & LDAP_UTF8_STRIPACCENT;
+
static unsigned char mask[] = {
0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
@@ -202,6 +206,15 @@
}
/* normalize ucs of length p - ucs */
uccanondecomp( ucs, p - ucs, &ucsout, &ucsoutlen );
+ if (strip) {
+ int in,ex;
+ for (in = 1, ex = 1; in < ucsoutlen; ++in) {
+ if (ucisnonspacing (ucsout[in])) continue;
+ ucsout[ex] = ucsout[in];
+ ++ex;
+ }
+ ucsoutlen = ex;
+ }
ucsoutlen = uccanoncomp( ucsout, ucsoutlen );
/* convert ucs to utf-8 and store in out */
for ( j = 0; j < ucsoutlen; j++ ) {
@@ -246,11 +259,13 @@
struct berval * UTF8bvnormalize(
struct berval *bv,
struct berval *newbv,
- unsigned casefold )
+ unsigned flags )
{
int i, j, len, clen, outpos, ucsoutlen, outsize, last;
char *out, *s;
unsigned long *ucs, *p, *ucsout;
+ int casefold = flags & LDAP_UTF8_CASEFOLD;
+ int strip = flags & LDAP_UTF8_STRIPACCENT;
static unsigned char mask[] = {
0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
@@ -362,6 +377,15 @@
}
/* normalize ucs of length p - ucs */
uccanondecomp( ucs, p - ucs, &ucsout, &ucsoutlen );
+ if (strip) {
+ int in,ex;
+ for (in = 1, ex = 1; in < ucsoutlen; ++in) {
+ if (ucisnonspacing (ucsout[in])) continue;
+ ucsout[ex] = ucsout[in];
+ ++ex;
+ }
+ ucsoutlen = ex;
+ }
ucsoutlen = uccanoncomp( ucsout, ucsoutlen );
/* convert ucs to utf-8 and store in out */
for ( j = 0; j < ucsoutlen; j++ ) {
@@ -408,10 +432,12 @@
int UTF8normcmp(
const char *s1,
const char *s2,
- unsigned casefold )
+ unsigned flags )
{
int i, l1, l2, len, ulen, res;
unsigned long *ucs, *ucsout1, *ucsout2;
+ int casefold = flags & LDAP_UTF8_CASEFOLD;
+ int strip = flags & LDAP_UTF8_STRIPACCENT;
l1 = strlen( s1 );
l2 = strlen( s2 );
@@ -467,6 +493,15 @@
len = LDAP_UTF8_CHARLEN( s1 + i );
}
uccanondecomp( ucs, ulen, &ucsout1, &l1 );
+ if (strip) {
+ int in,ex;
+ for (in = 1, ex = 1; in < l1; ++in) {
+ if (ucisnonspacing (ucsout1[in])) continue;
+ ucsout1[ex] = ucsout1[in];
+ ++ex;
+ }
+ l1 = ex;
+ }
l1 = uccanoncomp( ucsout1, l1 );
/* convert and normalize 2nd string */
@@ -480,6 +515,15 @@
len = LDAP_UTF8_CHARLEN( s2 + i );
}
uccanondecomp( ucs, ulen, &ucsout2, &l2 );
+ if (strip) {
+ int in,ex;
+ for (in = 1, ex = 1; in < l2; ++in) {
+ if (ucisnonspacing (ucsout2[in])) continue;
+ ucsout2[ex] = ucsout2[in];
+ ++ex;
+ }
+ l2 = ex;
+ }
l2 = uccanoncomp( ucsout2, l2 );
free( ucs );