diff --git a/ChangeLog b/ChangeLog index 24a68a7de4..5814f1e674 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,16 @@ +1999-12-20 Ulrich Drepper + + * locale/categories.def: Remove most of the collate definitions. + * locale/langinfo.h: Comment out corresponding definitions. + * locale/programs/locale-spec.c (locale_special): Don't recognize the + collate names yet. + * locale/programs/ld-collate.c: Correct and optimize computation of + weights. Set up list of all definitions correctly. Start writing + function to generate output file. + + * locale/programs/ld-ctype.c (allocate_arrays): Increment counter in + loop to compute default mapping. + 1999-12-19 Ulrich Drepper * sysdeps/unix/sysv/linux/powerpc/pread.c: Define __libc_* variant for diff --git a/locale/categories.def b/locale/categories.def index 4a617875d2..d94840c5c2 100644 --- a/locale/categories.def +++ b/locale/categories.def @@ -43,20 +43,6 @@ DEFINE_CATEGORY LC_COLLATE, "LC_COLLATE", ( DEFINE_ELEMENT (_NL_COLLATE_NRULES, "collate-nrules", std, word) - DEFINE_ELEMENT (_NL_COLLATE_RULES, "collate-rules", std, string) - DEFINE_ELEMENT (_NL_COLLATE_HASH_SIZE, "collate-hash-size", std, word) - DEFINE_ELEMENT (_NL_COLLATE_HASH_LAYERS, "collate-hash-layers", std, word) - DEFINE_ELEMENT (_NL_COLLATE_TABLEWC, "collate-tablewc", std, string) - DEFINE_ELEMENT (_NL_COLLATE_UNDEFINED_WC, "collate-undefined-wc", std, word) - DEFINE_ELEMENT (_NL_COLLATE_EXTRAWC, "collate-extrawc", std, string) - DEFINE_ELEMENT (_NL_COLLATE_ELEM_HASH_SIZE, "collate-elem-hash-size", std, word) - DEFINE_ELEMENT (_NL_COLLATE_ELEM_HASH, "collate-elem-hash", std, string) - DEFINE_ELEMENT (_NL_COLLATE_ELEM_STR_POOL, "collate-elem-str-pool", std, string) - DEFINE_ELEMENT (_NL_COLLATE_ELEM_VAL, "collate-elem-val", std, string) - DEFINE_ELEMENT (_NL_COLLATE_SYMB_HASH_SIZE, "collate-symb-hash-size", std, word) - DEFINE_ELEMENT (_NL_COLLATE_SYMB_HASH, "collate-symb-hash", std, string) - DEFINE_ELEMENT (_NL_COLLATE_SYMB_STR_POOL, 
"collate-symb-str-pool", std, string) - DEFINE_ELEMENT (_NL_COLLATE_SYMB_CLASSWC, "collate-symb-classwc", std, string) ), _nl_postload_collate) diff --git a/locale/langinfo.h b/locale/langinfo.h index 1248a46be8..44f4064708 100644 --- a/locale/langinfo.h +++ b/locale/langinfo.h @@ -231,12 +231,11 @@ enum This information is accessed by the strcoll and strxfrm functions. These `nl_langinfo' names are used only internally. */ _NL_COLLATE_NRULES = _NL_ITEM (LC_COLLATE, 0), - _NL_COLLATE_RULES, +#if 0 + _NL_COLLATE_TABLEMB, _NL_COLLATE_HASH_SIZE, _NL_COLLATE_HASH_LAYERS, - _NL_COLLATE_TABLEMB, _NL_COLLATE_TABLEWC, - _NL_COLLATE_UNDEFINED_MB, _NL_COLLATE_UNDEFINED_WC, _NL_COLLATE_EXTRAMB, _NL_COLLATE_EXTRAWC, @@ -251,6 +250,7 @@ enum _NL_COLLATE_SYMB_STR_POOL, _NL_COLLATE_SYMB_CLASSMB, _NL_COLLATE_SYMB_CLASSWC, +#endif _NL_NUM_LC_COLLATE, /* LC_CTYPE category: character classification. diff --git a/locale/programs/ld-collate.c b/locale/programs/ld-collate.c index 87005e86ab..52c4e3c6a1 100644 --- a/locale/programs/ld-collate.c +++ b/locale/programs/ld-collate.c @@ -73,9 +73,17 @@ struct element_t const char *mbs; const uint32_t *wcs; - int mborder; + int *mborder; int wcorder; + /* The following is a bit mask which bits are set if this element is + used in the appropriate level. Interesting for the singlebyte + weight computation. + + XXX The type here restricts the number of levels to 32. It could + we changed if necessary but I doubt this is necessary. */ + unsigned int used_in_level; + struct element_list_t *weights; /* Where does the definition come from. */ @@ -191,8 +199,8 @@ new_element (struct locale_collate_t *collate, const char *mbs, size_t mbslen, newp = (struct element_t *) obstack_alloc (&collate->mempool, sizeof (*newp)); - newp->name = name == NULL ? NULL : obstack_copy (&collate->mempool, - name, namelen); + newp->name = name == NULL ? 
NULL : obstack_copy0 (&collate->mempool, + name, namelen); if (mbs != NULL) newp->mbs = obstack_copy0 (&collate->mempool, mbs, mbslen); else @@ -207,8 +215,9 @@ new_element (struct locale_collate_t *collate, const char *mbs, size_t mbslen, } else newp->wcs = NULL; - newp->mborder = 0; + newp->mborder = NULL; newp->wcorder = 0; + newp->used_in_level = 0; /* Will be allocated later. */ newp->weights = NULL; @@ -477,7 +486,7 @@ find_element (struct linereader *ldfile, struct locale_collate_t *collate, else if (find_entry (&collate->elem_table, str, len, (void **) &result) != 0) { - /* It's also no collation element. So it is an character + /* It's also no collation element. So it is a character element defined later. */ result = new_element (collate, NULL, 0, NULL, str, len); if (result != NULL) @@ -493,11 +502,20 @@ find_element (struct linereader *ldfile, struct locale_collate_t *collate, static void unlink_element (struct locale_collate_t *collate) { - if (collate->cursor->next != NULL) - collate->cursor->next->last = collate->cursor->last; - if (collate->cursor->last != NULL) - collate->cursor->last->next = collate->cursor->next; - collate->cursor = collate->cursor->last; + if (collate->cursor == collate->start) + { + assert (collate->cursor->next == NULL); + assert (collate->cursor->last == NULL); + collate->cursor = NULL; + } + else + { + if (collate->cursor->next != NULL) + collate->cursor->next->last = collate->cursor->last; + if (collate->cursor->last != NULL) + collate->cursor->last->next = collate->cursor->next; + collate->cursor = collate->cursor->last; + } } @@ -516,6 +534,11 @@ insert_weights (struct linereader *ldfile, struct element_t *elem, elem->next = collate->cursor ? 
collate->cursor->next : NULL; if (collate->cursor != NULL) collate->cursor->next = elem; + if (collate->start == NULL) + { + assert (collate->cursor == NULL); + collate->start = elem; + } elem->weights = (struct element_list_t *) obstack_alloc (&collate->mempool, nrules * sizeof (struct element_list_t)); memset (elem->weights, '\0', nrules * sizeof (struct element_list_t)); @@ -566,7 +589,8 @@ insert_weights (struct linereader *ldfile, struct element_t *elem, const char *cp = arg->val.str.startmb; int cnt = 0; struct element_t *charelem; - void *base = obstack_base (&collate->mempool); + struct element_t **weights = NULL; + int max = 0; if (*cp == '\0') { @@ -581,18 +605,17 @@ insert_weights (struct linereader *ldfile, struct element_t *elem, if (*cp == '<') { /* Ahh, it's a bsymbol. That's what we want. */ - const char *startp = cp; + const char *startp = ++cp; - while (*++cp != '>') + while (*cp != '>') { if (*cp == ldfile->escape_char) ++cp; if (*cp == '\0') - { - /* It's a syntax error. */ - obstack_free (&collate->mempool, base); - goto syntax; - } + /* It's a syntax error. */ + goto syntax; + + ++cp; } charelem = find_element (ldfile, collate, startp, @@ -606,7 +629,7 @@ insert_weights (struct linereader *ldfile, struct element_t *elem, what this means. We interpret all characters in the string as if that would be bsymbols. Otherwise we would have to match back to bsymbols somehow and this - is also not what people normally expect. */ + is normally not what people expect. */ charelem = find_element (ldfile, collate, cp++, 1, NULL); } @@ -618,14 +641,25 @@ insert_weights (struct linereader *ldfile, struct element_t *elem, } /* Add the pointer. 
*/ - obstack_ptr_grow (&collate->mempool, charelem); - ++cnt; + if (cnt >= max) + { + struct element_t **newp; + max += 10; + newp = (struct element_t **) + alloca (max * sizeof (struct element_t *)); + memcpy (newp, weights, cnt * sizeof (struct element_t *)); + weights = newp; + } + weights[cnt++] = charelem; } while (*cp != '\0'); /* Now store the information. */ elem->weights[weight_cnt].w = (struct element_t **) - obstack_finish (&collate->mempool); + obstack_alloc (&collate->mempool, + cnt * sizeof (struct element_t *)); + memcpy (elem->weights[weight_cnt].w, weights, + cnt * sizeof (struct element_t *)); elem->weights[weight_cnt].cnt = cnt; /* We don't need the string anymore. */ @@ -946,10 +980,20 @@ order for `%.*s' already defined at %s:%zu"), /* Enqueue the new element. */ elem->last = collate->cursor; - elem->next = collate->cursor->next; - elem->last->next = elem; - if (elem->next != NULL) - elem->next->last = elem; + if (collate->cursor == NULL) /* Empty list: nothing to relink. */ + elem->next = NULL; + else + { + elem->next = collate->cursor->next; + elem->last->next = elem; + if (elem->next != NULL) + elem->next->last = elem; + } + if (collate->start == NULL) + { + assert (collate->cursor == NULL); + collate->start = elem; + } collate->cursor = elem; /* Add the weight value. We take them from the @@ -1232,10 +1276,69 @@ collate_finish (struct localedef_t *locale, struct charmap_t *charmap) The multibyte case is easy. We simply sort into an array with 256 elements. */ struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate; - int mbact = 2; - int wcact = 2; - struct element_t *runp = collate->start; + int mbact[nrules]; + int wcact; + struct element_t *runp; + int i; + int need_undefined = 0; + /* If this assertion is hit change the type in `element_t'. */ + assert (nrules <= sizeof (runp->used_in_level) * 8); + + /* Find out which elements are used at which level. At the same + time we find out whether we have any undefined symbols. 
*/ + runp = collate->start; + while (runp != NULL) + { + if (runp->mbs != NULL) + { + for (i = 0; i < nrules; ++i) + { + int j; + + for (j = 0; j < runp->weights[i].cnt; ++j) + /* A NULL pointer as the weight means IGNORE. */ + if (runp->weights[i].w[j] != NULL) + { + if (runp->weights[i].w[j]->weights == NULL) + { + error_at_line (0, 0, runp->file, runp->line, + _("symbol `%s' not defined"), + runp->weights[i].w[j]->name); + + need_undefined = 1; + runp->weights[i].w[j] = &collate->undefined; + } + else + /* Set the bit for the level. */ + runp->weights[i].w[j]->used_in_level |= 1 << i; + } + } + } + + /* Up to the next entry. */ + runp = runp->next; + } + + /* Walk through the list of defined sequences and assign weights. Also + create the data structure which will allow generating the single byte + character based tables. + + Since at each time only the weights for each of the rules are + only compared to other weights for this rule it is possible to + assign more compact weight values than simply counting all + weights in sequence. We can assign weights from 2 one for each + rule individually and only for those elements, which are actually + used for this rule. + + Why is this important? It is not for the wide char table. But + it is for the singlebyte output since here larger numbers have to + be encoded to make it possible to emit the value as a byte + string. */ + for (i = 0; i < nrules; ++i) + mbact[i] = 2; + wcact = 2; + runp = collate->start; while (runp != NULL) { if (runp->mbs != NULL) @@ -1243,10 +1346,20 @@ collate_finish (struct localedef_t *locale, struct charmap_t *charmap) struct element_t **eptr; /* Determine the order. 
*/ - runp->mborder = mbact++; + if (runp->used_in_level != 0) + { + runp->mborder = (int *) obstack_alloc (&collate->mempool, + nrules * sizeof (int)); + + for (i = 0; i < nrules; ++i) + if ((runp->used_in_level & (1 << i)) != 0) + runp->mborder[i] = mbact[i]++; + else + runp->mborder[i] = 0; + } /* Find the point where to insert in the list. */ - eptr = &collate->mbheads[(unsigned int) runp->mbs[0]]; + eptr = &collate->mbheads[((unsigned char *) runp->mbs)[0]]; while (*eptr != NULL) { /* Check which string is larger, the one we want to insert @@ -1269,6 +1382,31 @@ collate_finish (struct localedef_t *locale, struct charmap_t *charmap) /* Up to the next entry. */ runp = runp->next; } + + /* Find out whether any of the `mbheads' entries is unset. In this + case we use the UNDEFINED entry. */ + for (i = 1; i < 256; ++i) + if (collate->mbheads[i] == NULL) + { + need_undefined = 1; + collate->mbheads[i] = &collate->undefined; + } + + /* Now determine whether the UNDEFINED entry is needed and if yes, + whether it was defined. */ + if (need_undefined && collate->undefined.file == NULL) + { + error (0, 0, _("no definition of `UNDEFINED'")); + + /* Add UNDEFINED at the end. 
*/ + collate->undefined.mborder = + (int *) obstack_alloc (&collate->mempool, nrules * sizeof (int)); + + for (i = 0; i < nrules; ++i) + collate->undefined.mborder[i] = mbact[i]++; + + collate->undefined.wcorder = wcact++; + } } @@ -1276,6 +1414,33 @@ void collate_output (struct localedef_t *locale, struct charmap_t *charmap, const char *output_path) { + struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate; + const size_t nelems = _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE); + struct iovec iov[2 + nelems]; + struct locale_file data; + uint32_t idx[nelems]; + size_t cnt; + + data.magic = LIMAGIC (LC_COLLATE); + data.n = nelems; + iov[0].iov_base = (void *) &data; + iov[0].iov_len = sizeof (data); + + iov[1].iov_base = (void *) idx; + iov[1].iov_len = sizeof (idx); + + idx[0] = iov[0].iov_len + iov[1].iov_len; + cnt = 0; + + assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_NRULES)); + iov[2 + cnt].iov_base = &collate->nrules; + iov[2 + cnt].iov_len = sizeof (uint32_t); + /* Last item: idx[] holds only NELEMS offsets, so none is stored here. */ + ++cnt; + + assert (cnt == _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE)); + + write_locale_data (output_path, "LC_COLLATE", 2 + cnt, iov); } diff --git a/locale/programs/ld-ctype.c b/locale/programs/ld-ctype.c index 5dcb01360f..86d086021d 100644 --- a/locale/programs/ld-ctype.c +++ b/locale/programs/ld-ctype.c @@ -3071,11 +3071,14 @@ Computing table size for character classes might take a while..."), ctype->map32[idx][idx2] = ctype->map_collection[idx][idx2]; while (idx2 < ctype->map_collection_act[idx]) - if (ctype->map_collection[idx][idx2] != 0) - *find_idx (ctype, &ctype->map32[idx], - &ctype->map_collection_max[idx], - &ctype->map_collection_act[idx], - ctype->names[idx2]) = ctype->map_collection[idx][idx2]; + { + if (ctype->map_collection[idx][idx2] != 0) + *find_idx (ctype, &ctype->map32[idx], + &ctype->map_collection_max[idx], + &ctype->map_collection_act[idx], + ctype->names[idx2]) = ctype->map_collection[idx][idx2]; + ++idx2; + } } /* Extra 
array for class and map names. */ diff --git a/locale/programs/locale-spec.c b/locale/programs/locale-spec.c index 368306c1c6..048dead683 100644 --- a/locale/programs/locale-spec.c +++ b/locale/programs/locale-spec.c @@ -36,6 +36,7 @@ void locale_special (const char *name, int show_category_name, int show_keyword_name) { +#if 0 /* "collate-elements": print collation elements of locale. */ if (strcmp (name, "collate-elements") == 0) { @@ -59,7 +60,6 @@ locale_special (const char *name, int show_category_name, printf ("%s<%s>", first ? "" : ";", &__collate_element_strings[idx]); -#if 0 /* We don't print the string. This is only confusing because only the programs have to know the encoding. The code is left in place because it @@ -85,7 +85,6 @@ locale_special (const char *name, int show_category_name, putchar ('"'); } -#endif first = 0; } } @@ -125,4 +124,5 @@ locale_special (const char *name, int show_category_name, putchar ('\n'); return; } +#endif }