summaryrefslogtreecommitdiffstats
path: root/AppPkg/Applications/Python/Python-2.7.2/Modules/unicodedata.c
diff options
context:
space:
mode:
Diffstat (limited to 'AppPkg/Applications/Python/Python-2.7.2/Modules/unicodedata.c')
-rw-r--r--AppPkg/Applications/Python/Python-2.7.2/Modules/unicodedata.c1280
1 files changed, 1280 insertions, 0 deletions
diff --git a/AppPkg/Applications/Python/Python-2.7.2/Modules/unicodedata.c b/AppPkg/Applications/Python/Python-2.7.2/Modules/unicodedata.c
new file mode 100644
index 0000000000..501e050e0f
--- /dev/null
+++ b/AppPkg/Applications/Python/Python-2.7.2/Modules/unicodedata.c
@@ -0,0 +1,1280 @@
+/* ------------------------------------------------------------------------
+
+ unicodedata -- Provides access to the Unicode 5.2 data base.
+
+ Data was extracted from the Unicode 5.2 UnicodeData.txt file.
+
+ Written by Marc-Andre Lemburg (mal@lemburg.com).
+ Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
+ Modified by Martin v. Löwis (martin@v.loewis.de)
+
+ Copyright (c) Corporation for National Research Initiatives.
+
+ ------------------------------------------------------------------------ */
+
+#include "Python.h"
+#include "ucnhash.h"
+#include "structmember.h"
+
+/* character properties */
+
+/* Property record shared by a class of code points; _getrecord_ex() returns
+   a pointer to one of these out of _PyUnicode_Database_Records. */
+typedef struct {
+ const unsigned char category; /* index into
+ _PyUnicode_CategoryNames */
+ const unsigned char combining; /* combining class value 0 - 255 */
+ const unsigned char bidirectional; /* index into
+ _PyUnicode_BidirectionalNames */
+ const unsigned char mirrored; /* true if mirrored in bidir mode */
+ const unsigned char east_asian_width; /* index into
+ _PyUnicode_EastAsianWidthNames */
+ const unsigned char normalization_quick_check; /* see is_normalized() */
+} _PyUnicode_DatabaseRecord;
+
+/* Per-code-point difference between the current database and an older
+   Unicode version. The accessors in this file treat 0xFF in a byte-sized
+   field as "not changed" and category_changed == 0 as "unassigned in the
+   old version". */
+typedef struct change_record {
+ /* sequence of fields should be the same as in merge_old_version */
+ const unsigned char bidir_changed;
+ const unsigned char category_changed;
+ const unsigned char decimal_changed;
+ const unsigned char mirrored_changed;
+ const double numeric_changed;
+} change_record;
+
+/* data file generated by Tools/unicode/makeunicodedata.py */
+#include "unicodedata_db.h"
+
+/* Map a code point to its property record through the two-level
+   index1/index2 tables. Anything at or beyond 0x110000 gets record 0. */
+static const _PyUnicode_DatabaseRecord*
+_getrecord_ex(Py_UCS4 code)
+{
+    int slot = 0;
+
+    if (code < 0x110000) {
+        int page = index1[(code>>SHIFT)];
+        slot = index2[(page<<SHIFT)+(code&((1<<SHIFT)-1))];
+    }
+
+    return &_PyUnicode_Database_Records[slot];
+}
+
+/* ------------- Previous-version API ------------------------------------- */
+/* Instance layout of UCD_Type objects: a handle on an older version of the
+   database, built by new_previous_version(). */
+typedef struct previous_version {
+ PyObject_HEAD
+ const char *name; /* exposed to Python as unidata_version (see DB_members) */
+ const change_record* (*getrecord)(Py_UCS4); /* change record for a code point */
+ Py_UCS4 (*normalization)(Py_UCS4); /* a nonzero result replaces the code
+ point in nfd_nfkd() */
+} PreviousDBVersion;
+
+/* Fetch the change record for code point v from the old-version object self.
+   self is cast without a type check, so it must be a PreviousDBVersion. */
+#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
+
+/* Member table for UCD_Type: publishes PreviousDBVersion.name as the
+   read-only string attribute "unidata_version". */
+static PyMemberDef DB_members[] = {
+ {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
+ {NULL}
+};
+
+/* forward declaration */
+static PyTypeObject UCD_Type;
+
+/* Create a UCD_Type object for an older database version. The name pointer
+   and both callbacks are stored as given (name is not copied). Returns a new
+   reference, or NULL if the allocation failed. */
+static PyObject*
+new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
+                     Py_UCS4 (*normalization)(Py_UCS4))
+{
+    PreviousDBVersion *db = PyObject_New(PreviousDBVersion, &UCD_Type);
+
+    if (db != NULL) {
+        db->name = name;
+        db->getrecord = getrecord;
+        db->normalization = normalization;
+    }
+    return (PyObject*)db;
+}
+
+
+/* Extract the single code point held by obj. On builds without
+   Py_UNICODE_WIDE a high/low surrogate pair is also accepted and combined.
+   For any other string a TypeError is set and (Py_UCS4)-1 is returned. */
+static Py_UCS4 getuchar(PyUnicodeObject *obj)
+{
+    Py_UNICODE *chars = PyUnicode_AS_UNICODE(obj);
+    Py_ssize_t length = PyUnicode_GET_SIZE(obj);
+
+    if (length == 1)
+        return *chars;
+#ifndef Py_UNICODE_WIDE
+    if (length == 2 &&
+        (0xD800 <= chars[0] && chars[0] <= 0xDBFF) &&
+        (0xDC00 <= chars[1] && chars[1] <= 0xDFFF))
+        return (((chars[0] & 0x3FF)<<10) | (chars[1] & 0x3FF)) + 0x10000;
+#endif
+    PyErr_SetString(PyExc_TypeError,
+                    "need a single Unicode character as parameter");
+    return (Py_UCS4)-1;
+}
+
+/* --- Module API --------------------------------------------------------- */
+
+PyDoc_STRVAR(unicodedata_decimal__doc__,
+"decimal(unichr[, default])\n\
+\n\
+Returns the decimal value assigned to the Unicode character unichr\n\
+as integer. If no such value is defined, default is returned, or, if\n\
+not given, ValueError is raised.");
+
+/* Python binding for decimal(). When self is an old-version object its
+   change record wins: an unassigned character has no value, and a recorded
+   decimal_changed replaces the current database value. */
+static PyObject *
+unicodedata_decimal(PyObject *self, PyObject *args)
+{
+    PyUnicodeObject *v;
+    PyObject *defobj = NULL;
+    long value;
+    Py_UCS4 c;
+
+    if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
+        return NULL;
+    c = getuchar(v);
+    if (c == (Py_UCS4)-1)
+        return NULL;
+
+    if (self == NULL)
+        value = Py_UNICODE_TODECIMAL(c);
+    else {
+        const change_record *old = get_old_record(self, c);
+        if (old->category_changed == 0)
+            value = -1;                     /* unassigned */
+        else if (old->decimal_changed != 0xFF)
+            value = old->decimal_changed;
+        else
+            value = Py_UNICODE_TODECIMAL(c);
+    }
+
+    if (value >= 0)
+        return PyInt_FromLong(value);
+
+    /* No decimal value: hand back the caller's default, or complain. */
+    if (defobj != NULL) {
+        Py_INCREF(defobj);
+        return defobj;
+    }
+    PyErr_SetString(PyExc_ValueError,
+                    "not a decimal");
+    return NULL;
+}
+
+PyDoc_STRVAR(unicodedata_digit__doc__,
+"digit(unichr[, default])\n\
+\n\
+Returns the digit value assigned to the Unicode character unichr as\n\
+integer. If no such value is defined, default is returned, or, if\n\
+not given, ValueError is raised.");
+
+/* Python binding for digit(). Always answers from the current database;
+   self is not consulted. */
+static PyObject *
+unicodedata_digit(PyObject *self, PyObject *args)
+{
+    PyUnicodeObject *v;
+    PyObject *defobj = NULL;
+    long digit;
+    Py_UCS4 c;
+
+    if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
+        return NULL;
+    c = getuchar(v);
+    if (c == (Py_UCS4)-1)
+        return NULL;
+
+    digit = Py_UNICODE_TODIGIT(c);
+    if (digit >= 0)
+        return PyInt_FromLong(digit);
+
+    /* No digit value: hand back the caller's default, or complain. */
+    if (defobj != NULL) {
+        Py_INCREF(defobj);
+        return defobj;
+    }
+    PyErr_SetString(PyExc_ValueError, "not a digit");
+    return NULL;
+}
+
+PyDoc_STRVAR(unicodedata_numeric__doc__,
+"numeric(unichr[, default])\n\
+\n\
+Returns the numeric value assigned to the Unicode character unichr\n\
+as float. If no such value is defined, default is returned, or, if\n\
+not given, ValueError is raised.");
+
+/* Python binding for numeric(). -1.0 is the in-band "no value" marker. When
+   self is an old-version object, an unassigned character has no value and a
+   recorded decimal_changed replaces the current database value. */
+static PyObject *
+unicodedata_numeric(PyObject *self, PyObject *args)
+{
+    PyUnicodeObject *v;
+    PyObject *defobj = NULL;
+    double value;
+    Py_UCS4 c;
+
+    if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
+        return NULL;
+    c = getuchar(v);
+    if (c == (Py_UCS4)-1)
+        return NULL;
+
+    if (self == NULL)
+        value = Py_UNICODE_TONUMERIC(c);
+    else {
+        const change_record *old = get_old_record(self, c);
+        if (old->category_changed == 0)
+            value = -1.0;                   /* unassigned */
+        else if (old->decimal_changed != 0xFF)
+            value = old->decimal_changed;
+        else
+            value = Py_UNICODE_TONUMERIC(c);
+    }
+
+    if (value != -1.0)
+        return PyFloat_FromDouble(value);
+
+    /* No numeric value: hand back the caller's default, or complain. */
+    if (defobj != NULL) {
+        Py_INCREF(defobj);
+        return defobj;
+    }
+    PyErr_SetString(PyExc_ValueError, "not a numeric character");
+    return NULL;
+}
+
+PyDoc_STRVAR(unicodedata_category__doc__,
+"category(unichr)\n\
+\n\
+Returns the general category assigned to the Unicode character\n\
+unichr as string.");
+
+/* Python binding for category(). An old-version object overrides the
+   current category whenever its change record holds something other
+   than 0xFF. */
+static PyObject *
+unicodedata_category(PyObject *self, PyObject *args)
+{
+    PyUnicodeObject *v;
+    int category;
+    Py_UCS4 c;
+
+    if (!PyArg_ParseTuple(args, "O!:category",
+                          &PyUnicode_Type, &v))
+        return NULL;
+    c = getuchar(v);
+    if (c == (Py_UCS4)-1)
+        return NULL;
+
+    category = (int) _getrecord_ex(c)->category;
+    if (self != NULL) {
+        unsigned char changed = get_old_record(self, c)->category_changed;
+        if (changed != 0xFF)
+            category = changed;
+    }
+    return PyString_FromString(_PyUnicode_CategoryNames[category]);
+}
+
+PyDoc_STRVAR(unicodedata_bidirectional__doc__,
+"bidirectional(unichr)\n\
+\n\
+Returns the bidirectional category assigned to the Unicode character\n\
+unichr as string. If no such value is defined, an empty string is\n\
+returned.");
+
+/* Python binding for bidirectional(). For an old-version object, unassigned
+   characters map to name index 0 and a recorded bidir_changed replaces the
+   current class. */
+static PyObject *
+unicodedata_bidirectional(PyObject *self, PyObject *args)
+{
+    PyUnicodeObject *v;
+    int bidi;
+    Py_UCS4 c;
+
+    if (!PyArg_ParseTuple(args, "O!:bidirectional",
+                          &PyUnicode_Type, &v))
+        return NULL;
+    c = getuchar(v);
+    if (c == (Py_UCS4)-1)
+        return NULL;
+
+    bidi = (int) _getrecord_ex(c)->bidirectional;
+    if (self != NULL) {
+        const change_record *old = get_old_record(self, c);
+        if (old->category_changed == 0)
+            bidi = 0;                       /* unassigned */
+        else if (old->bidir_changed != 0xFF)
+            bidi = old->bidir_changed;
+    }
+    return PyString_FromString(_PyUnicode_BidirectionalNames[bidi]);
+}
+
+PyDoc_STRVAR(unicodedata_combining__doc__,
+"combining(unichr)\n\
+\n\
+Returns the canonical combining class assigned to the Unicode\n\
+character unichr as integer. Returns 0 if no combining class is\n\
+defined.");
+
+/* Python binding for combining(). Characters unassigned in the requested
+   old version report class 0. */
+static PyObject *
+unicodedata_combining(PyObject *self, PyObject *args)
+{
+    PyUnicodeObject *v;
+    int ccc;
+    Py_UCS4 c;
+
+    if (!PyArg_ParseTuple(args, "O!:combining",
+                          &PyUnicode_Type, &v))
+        return NULL;
+    c = getuchar(v);
+    if (c == (Py_UCS4)-1)
+        return NULL;
+
+    ccc = (int) _getrecord_ex(c)->combining;
+    if (self != NULL && get_old_record(self, c)->category_changed == 0)
+        ccc = 0;                            /* unassigned */
+    return PyInt_FromLong(ccc);
+}
+
+PyDoc_STRVAR(unicodedata_mirrored__doc__,
+"mirrored(unichr)\n\
+\n\
+Returns the mirrored property assigned to the Unicode character\n\
+unichr as integer. Returns 1 if the character has been identified as\n\
+a \"mirrored\" character in bidirectional text, 0 otherwise.");
+
+/* Python binding for mirrored(). For an old-version object, unassigned
+   characters report 0 and a recorded mirrored_changed replaces the current
+   flag. */
+static PyObject *
+unicodedata_mirrored(PyObject *self, PyObject *args)
+{
+    PyUnicodeObject *v;
+    int flag;
+    Py_UCS4 c;
+
+    if (!PyArg_ParseTuple(args, "O!:mirrored",
+                          &PyUnicode_Type, &v))
+        return NULL;
+    c = getuchar(v);
+    if (c == (Py_UCS4)-1)
+        return NULL;
+
+    flag = (int) _getrecord_ex(c)->mirrored;
+    if (self != NULL) {
+        const change_record *old = get_old_record(self, c);
+        if (old->category_changed == 0)
+            flag = 0;                       /* unassigned */
+        else if (old->mirrored_changed != 0xFF)
+            flag = old->mirrored_changed;
+    }
+    return PyInt_FromLong(flag);
+}
+
+PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
+"east_asian_width(unichr)\n\
+\n\
+Returns the east asian width assigned to the Unicode character\n\
+unichr as string.");
+
+/* Python binding for east_asian_width(). Characters unassigned in the
+   requested old version map to name index 0. */
+static PyObject *
+unicodedata_east_asian_width(PyObject *self, PyObject *args)
+{
+    PyUnicodeObject *v;
+    int width;
+    Py_UCS4 c;
+
+    if (!PyArg_ParseTuple(args, "O!:east_asian_width",
+                          &PyUnicode_Type, &v))
+        return NULL;
+    c = getuchar(v);
+    if (c == (Py_UCS4)-1)
+        return NULL;
+
+    width = (int) _getrecord_ex(c)->east_asian_width;
+    if (self != NULL && get_old_record(self, c)->category_changed == 0)
+        width = 0;                          /* unassigned */
+    return PyString_FromString(_PyUnicode_EastAsianWidthNames[width]);
+}
+
+PyDoc_STRVAR(unicodedata_decomposition__doc__,
+"decomposition(unichr)\n\
+\n\
+Returns the character decomposition mapping assigned to the Unicode\n\
+character unichr as string. An empty string is returned in case no\n\
+such mapping is defined.");
+
+/* Python binding for decomposition(). Builds the mapping text in a local
+   buffer: the prefix string selected from decomp_prefix, followed by each
+   mapped value formatted with "%04X" (a space precedes a value whenever
+   something has already been written). Characters unassigned in the
+   requested old version yield the empty string. */
+static PyObject *
+unicodedata_decomposition(PyObject *self, PyObject *args)
+{
+ PyUnicodeObject *v;
+ char decomp[256];
+ int code, index, count, i;
+ unsigned int prefix_index;
+ Py_UCS4 c;
+
+ if (!PyArg_ParseTuple(args, "O!:decomposition",
+ &PyUnicode_Type, &v))
+ return NULL;
+ c = getuchar(v);
+ if (c == (Py_UCS4)-1)
+ return NULL;
+
+ code = (int)c;
+
+ if (self) {
+ const change_record *old = get_old_record(self, c);
+ if (old->category_changed == 0)
+ return PyString_FromString(""); /* unassigned */
+ }
+
+ /* Two-level table lookup; out-of-range code points use entry 0. */
+ if (code < 0 || code >= 0x110000)
+ index = 0;
+ else {
+ index = decomp_index1[(code>>DECOMP_SHIFT)];
+ index = decomp_index2[(index<<DECOMP_SHIFT)+
+ (code&((1<<DECOMP_SHIFT)-1))];
+ }
+
+ /* The bits above the low byte hold the number of mapped values that
+ follow (usually one or two); the low byte is the prefix code, an
+ index into decomp_prefix. */
+ count = decomp_data[index] >> 8;
+
+ /* XXX: could allocate the PyString up front instead
+ (strlen(prefix) + 5 * count + 1 bytes) */
+
+ /* Based on how index is calculated above and decomp_data is generated
+ from Tools/unicode/makeunicodedata.py, it should not be possible
+ to overflow decomp_prefix. */
+ prefix_index = decomp_data[index] & 255;
+ assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));
+
+ /* copy prefix */
+ i = strlen(decomp_prefix[prefix_index]);
+ memcpy(decomp, decomp_prefix[prefix_index], i);
+
+ /* Append the mapped values; i always indexes the next free byte. */
+ while (count-- > 0) {
+ if (i)
+ decomp[i++] = ' ';
+ assert((size_t)i < sizeof(decomp));
+ PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
+ decomp_data[++index]);
+ i += strlen(decomp + i);
+ }
+
+ decomp[i] = '\0';
+
+ return PyString_FromString(decomp);
+}
+
+/* Locate the decomposition of code in decomp_data. On return *count is the
+   number of mapped values, *prefix the prefix code (index into
+   decomp_prefix) and *index the position of the first mapped value.
+   Out-of-range code points, and characters unassigned in the old version
+   selected by self, use table entry 0. */
+static void
+get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
+{
+    int slot = 0;
+
+    if (code < 0x110000 &&
+        !(self && get_old_record(self, code)->category_changed==0)) {
+        slot = decomp_index1[(code>>DECOMP_SHIFT)];
+        slot = decomp_index2[(slot<<DECOMP_SHIFT)+
+                             (code&((1<<DECOMP_SHIFT)-1))];
+    }
+
+    /* Header word: value count above the low byte, prefix code in the
+       low byte. */
+    *count = decomp_data[slot] >> 8;
+    *prefix = decomp_data[slot] & 255;
+
+    /* The mapped values start right after the header word. */
+    *index = slot + 1;
+}
+
+/* Constants for arithmetic Hangul syllable (de)composition: the first
+   precomposed syllable (SBase), the bases used for leading consonants (L),
+   vowels (V) and trailing consonants (T), and the size of each range.
+   TBase is one below the first real trailing consonant, so T index 0 means
+   "no trailing consonant" (see the T != TBase test in nfd_nfkd). */
+#define SBase 0xAC00
+#define LBase 0x1100
+#define VBase 0x1161
+#define TBase 0x11A7
+#define LCount 19
+#define VCount 21
+#define TCount 28
+#define NCount (VCount*TCount)
+#define SCount (LCount*NCount)
+
+/* Return a new unicode object holding the NFD (k == 0) or NFKD (k != 0)
+   form of input. self, when non-NULL, selects an older database version
+   whose normalization corrections and unassigned characters are honoured.
+   Returns NULL with an exception set on allocation failure.
+
+   The caller must not pass an empty string (unicodedata_normalize()
+   filters that case): the canonical-ordering pass reads the first
+   character of the result unconditionally. */
+static PyObject*
+nfd_nfkd(PyObject *self, PyObject *input, int k)
+{
+    PyObject *result;
+    Py_UNICODE *i, *end, *o;
+    /* Longest decomposition in Unicode 3.2: U+FDFA */
+    Py_UNICODE stack[20];
+    Py_ssize_t space, isize;
+    int index, prefix, count, stackptr;
+    unsigned char prev, cur;
+
+    stackptr = 0;
+    isize = PyUnicode_GET_SIZE(input);
+    /* Overallocate at most 10 characters. */
+    space = (isize > 10 ? 10 : isize) + isize;
+    result = PyUnicode_FromUnicode(NULL, space);
+    if (!result)
+        return NULL;
+    i = PyUnicode_AS_UNICODE(input);
+    end = i + isize;
+    o = PyUnicode_AS_UNICODE(result);
+
+    while (i < end) {
+        stack[stackptr++] = *i++;
+        while(stackptr) {
+            Py_UNICODE code = stack[--stackptr];
+            /* Hangul Decomposition adds three characters in
+               a single step, so we need at least that much room. */
+            if (space < 3) {
+                /* result is a unicode object, so its length must be read
+                   with the unicode accessor, not the string one. */
+                Py_ssize_t newsize = PyUnicode_GET_SIZE(result) + 10;
+                space += 10;
+                if (PyUnicode_Resize(&result, newsize) == -1) {
+                    /* A failed resize leaves the object alive and still
+                       owned by us; release it instead of leaking it. */
+                    Py_DECREF(result);
+                    return NULL;
+                }
+                o = PyUnicode_AS_UNICODE(result) + newsize - space;
+            }
+            /* Hangul Decomposition. */
+            if (SBase <= code && code < (SBase+SCount)) {
+                int SIndex = code - SBase;
+                int L = LBase + SIndex / NCount;
+                int V = VBase + (SIndex % NCount) / TCount;
+                int T = TBase + SIndex % TCount;
+                *o++ = L;
+                *o++ = V;
+                space -= 2;
+                if (T != TBase) {
+                    *o++ = T;
+                    space --;
+                }
+                continue;
+            }
+            /* normalization changes */
+            if (self) {
+                Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
+                if (value != 0) {
+                    stack[stackptr++] = value;
+                    continue;
+                }
+            }
+
+            /* Other decompositions. */
+            get_decomp_record(self, code, &index, &prefix, &count);
+
+            /* Copy character if it is not decomposable, or has a
+               compatibility decomposition, but we do NFD. */
+            if (!count || (prefix && !k)) {
+                *o++ = code;
+                space--;
+                continue;
+            }
+            /* Copy decomposition onto the stack, in reverse
+               order. */
+            while(count) {
+                code = decomp_data[index + (--count)];
+                stack[stackptr++] = code;
+            }
+        }
+    }
+
+    /* Drop overallocation. Cannot fail. */
+    PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
+
+    /* Sort canonically. */
+    i = PyUnicode_AS_UNICODE(result);
+    prev = _getrecord_ex(*i)->combining;
+    end = i + PyUnicode_GET_SIZE(result);
+    for (i++; i < end; i++) {
+        cur = _getrecord_ex(*i)->combining;
+        if (prev == 0 || cur == 0 || prev <= cur) {
+            prev = cur;
+            continue;
+        }
+        /* Non-canonical order. Need to switch *i with previous. */
+        o = i - 1;
+        while (1) {
+            Py_UNICODE tmp = o[1];
+            o[1] = o[0];
+            o[0] = tmp;
+            o--;
+            if (o < PyUnicode_AS_UNICODE(result))
+                break;
+            prev = _getrecord_ex(*o)->combining;
+            if (prev == 0 || prev <= cur)
+                break;
+        }
+        prev = _getrecord_ex(*i)->combining;
+    }
+    return result;
+}
+
+/* Translate code into a composition-table index using the reindex table
+   nfc. The table is terminated by an entry whose start is 0, and an entry
+   covers start .. start + count inclusive. The scan stops as soon as an
+   entry starts above code. Returns -1 when code is not covered. self is
+   unused. */
+static int
+find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
+{
+    struct reindex *entry;
+
+    for (entry = nfc; entry->start; entry++) {
+        int start = entry->start;
+        if (code < start)
+            break;
+        if (code <= start + entry->count) {
+            int delta = code - start;
+            return entry->index + delta;
+        }
+    }
+    return -1;
+}
+
+/* Return a new unicode object holding the NFC (k == 0) or NFKC (k != 0)
+   form of input: the string is first decomposed with nfd_nfkd() and the
+   result is then recomposed in place. self, when non-NULL, selects an
+   older database version for the decomposition step. Returns NULL with an
+   exception set if the decomposition fails. */
+static PyObject*
+nfc_nfkc(PyObject *self, PyObject *input, int k)
+{
+    PyObject *result;
+    Py_UNICODE *i, *i1, *o, *end;
+    int f,l,index,index1,comb;
+    Py_UNICODE code;
+    Py_UNICODE *skipped[20];
+    int cskipped = 0;
+
+    result = nfd_nfkd(self, input, k);
+    if (!result)
+        return NULL;
+
+    /* We are going to modify result in-place.
+       If nfd_nfkd is changed to sometimes return the input,
+       this code needs to be reviewed. */
+    assert(result != input);
+
+    i = PyUnicode_AS_UNICODE(result);
+    end = i + PyUnicode_GET_SIZE(result);
+    o = PyUnicode_AS_UNICODE(result);
+
+  again:
+    while (i < end) {
+      for (index = 0; index < cskipped; index++) {
+          if (skipped[index] == i) {
+              /* *i character is skipped.
+                 Remove from list. */
+              skipped[index] = skipped[cskipped-1];
+              cskipped--;
+              i++;
+              goto again; /* continue while */
+          }
+      }
+      /* Hangul Composition. We don't need to check for <LV,T>
+         pairs, since we always have decomposed data.
+         The vowel range is VBase .. VBase+VCount-1, so the upper bound is
+         exclusive; accepting VBase+VCount (U+1176) would compose a
+         character that is not a syllable vowel. */
+      if (LBase <= *i && *i < (LBase+LCount) &&
+          i + 1 < end &&
+          VBase <= i[1] && i[1] < (VBase+VCount)) {
+          int LIndex, VIndex;
+          LIndex = i[0] - LBase;
+          VIndex = i[1] - VBase;
+          code = SBase + (LIndex*VCount+VIndex)*TCount;
+          i+=2;
+          /* Trailing consonants are TBase+1 .. TBase+TCount-1. TBase
+             itself is only the "no trailing consonant" placeholder and
+             must not be swallowed; TBase+TCount lies outside the range. */
+          if (i < end &&
+              TBase < *i && *i < (TBase+TCount)) {
+              code += *i-TBase;
+              i++;
+          }
+          *o++ = code;
+          continue;
+      }
+
+      f = find_nfc_index(self, nfc_first, *i);
+      if (f == -1) {
+          *o++ = *i++;
+          continue;
+      }
+      /* Find next unblocked character. */
+      i1 = i+1;
+      comb = 0;
+      while (i1 < end) {
+          int comb1 = _getrecord_ex(*i1)->combining;
+          if (comb) {
+              if (comb1 == 0)
+                  break;
+              if (comb >= comb1) {
+                  /* Character is blocked. */
+                  i1++;
+                  continue;
+              }
+          }
+          l = find_nfc_index(self, nfc_last, *i1);
+          /* *i1 cannot be combined with *i. If *i1
+             is a starter, we don't need to look further.
+             Otherwise, record the combining class. */
+          if (l == -1) {
+            not_combinable:
+              if (comb1 == 0)
+                  break;
+              comb = comb1;
+              i1++;
+              continue;
+          }
+          index = f*TOTAL_LAST + l;
+          index1 = comp_index[index >> COMP_SHIFT];
+          code = comp_data[(index1<<COMP_SHIFT)+
+                           (index&((1<<COMP_SHIFT)-1))];
+          if (code == 0)
+              goto not_combinable;
+
+          /* Replace the original character. */
+          *i = code;
+          /* Mark the second character unused. */
+          assert(cskipped < 20);
+          skipped[cskipped++] = i1;
+          i1++;
+          f = find_nfc_index(self, nfc_first, *i);
+          if (f == -1)
+              break;
+      }
+      *o++ = *i++;
+    }
+    /* Composition only ever shrinks the text; trim the unused tail. */
+    if (o != end)
+        PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
+    return result;
+}
+
+/* Quick check: return 1 if input is certainly already in the requested
+   normal form, 0 if it might not be. nfc selects the composed forms and k
+   the compatibility forms. Always answers 0 when an old database version
+   (self) is in use. */
+static int
+is_normalized(PyObject *self, PyObject *input, int nfc, int k)
+{
+    Py_UNICODE *p, *stop;
+    unsigned char last_ccc = 0, mask;
+    int shift = 0;
+
+    /* An older version of the database is requested, quickchecks must be
+       disabled. */
+    if (self != NULL)
+        return 0;
+
+    /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
+       as described in http://unicode.org/reports/tr15/#Annex8. */
+    if (nfc)
+        shift += 4;
+    if (k)
+        shift += 2;
+    mask = 3 << shift;
+
+    p = PyUnicode_AS_UNICODE(input);
+    stop = p + PyUnicode_GET_SIZE(input);
+    for (; p < stop; p++) {
+        const _PyUnicode_DatabaseRecord *rec = _getrecord_ex(*p);
+        unsigned char ccc = rec->combining;
+
+        if (rec->normalization_quick_check & mask)
+            return 0; /* this string might need normalization */
+        if (ccc && last_ccc > ccc)
+            return 0; /* non-canonical sort order, not normalized */
+        last_ccc = ccc;
+    }
+    return 1; /* certainly normalized */
+}
+
+PyDoc_STRVAR(unicodedata_normalize__doc__,
+"normalize(form, unistr)\n\
+\n\
+Return the normal form 'form' for the Unicode string unistr. Valid\n\
+values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
+
+/* Python binding for normalize(). An empty string is returned unchanged
+   before the form name is even examined; otherwise the input itself is
+   returned when the quick check proves it is already normalized. */
+static PyObject*
+unicodedata_normalize(PyObject *self, PyObject *args)
+{
+    char *form;
+    PyObject *input;
+    int compose, compat;
+
+    if(!PyArg_ParseTuple(args, "sO!:normalize",
+                         &form, &PyUnicode_Type, &input))
+        return NULL;
+
+    if (PyUnicode_GetSize(input) == 0) {
+        /* Special case empty input strings, since resizing
+           them later would cause internal errors. */
+        Py_INCREF(input);
+        return input;
+    }
+
+    /* Decode the form name into (composed?, compatibility?) flags. */
+    if (strcmp(form, "NFC") == 0) {
+        compose = 1;
+        compat = 0;
+    }
+    else if (strcmp(form, "NFKC") == 0) {
+        compose = 1;
+        compat = 1;
+    }
+    else if (strcmp(form, "NFD") == 0) {
+        compose = 0;
+        compat = 0;
+    }
+    else if (strcmp(form, "NFKD") == 0) {
+        compose = 0;
+        compat = 1;
+    }
+    else {
+        PyErr_SetString(PyExc_ValueError, "invalid normalization form");
+        return NULL;
+    }
+
+    if (is_normalized(self, input, compose, compat)) {
+        Py_INCREF(input);
+        return input;
+    }
+    if (compose)
+        return nfc_nfkc(self, input, compat);
+    return nfd_nfkd(self, input, compat);
+}
+
+/* -------------------------------------------------------------------- */
+/* unicode character name tables */
+
+/* data file generated by Tools/unicode/makeunicodedata.py */
+#include "unicodename_db.h"
+
+/* -------------------------------------------------------------------- */
+/* database code (cut and pasted from the unidb package) */
+
+/* Case-insensitive hash of the first len bytes of s, used to probe
+   code_hash. Each byte is upper-cased and mixed in as h*scale + byte;
+   whenever bits 24-31 become set they are folded back into the low byte
+   and the value is cut to 24 bits. */
+static unsigned long
+_gethash(const char *s, int len, int scale)
+{
+    unsigned long hash = 0;
+    int pos;
+
+    for (pos = 0; pos < len; pos++) {
+        unsigned long top;
+
+        hash = (hash * scale) + (unsigned char) toupper(Py_CHARMASK(s[pos]));
+        top = hash & 0xff000000;
+        if (top)
+            hash = (hash ^ ((top>>24) & 0xff)) & 0x00ffffff;
+    }
+    return hash;
+}
+
+/* Short names of the Hangul jamo used to build and parse
+   "HANGUL SYLLABLE ..." names. Column 0 holds leading consonants, column 1
+   vowels and column 2 trailing consonants; the row is the L, V or T index.
+   Rows beyond LCount (column 0) and VCount (column 1) are null pointers
+   and must not be read; "" is a valid, empty short name. */
+static char *hangul_syllables[][3] = {
+ { "G", "A", "" },
+ { "GG", "AE", "G" },
+ { "N", "YA", "GG" },
+ { "D", "YAE", "GS" },
+ { "DD", "EO", "N", },
+ { "R", "E", "NJ" },
+ { "M", "YEO", "NH" },
+ { "B", "YE", "D" },
+ { "BB", "O", "L" },
+ { "S", "WA", "LG" },
+ { "SS", "WAE", "LM" },
+ { "", "OE", "LB" },
+ { "J", "YO", "LS" },
+ { "JJ", "U", "LT" },
+ { "C", "WEO", "LP" },
+ { "K", "WE", "LH" },
+ { "T", "WI", "M" },
+ { "P", "YU", "B" },
+ { "H", "EU", "BS" },
+ { 0, "YI", "S" },
+ { 0, "I", "SS" },
+ { 0, 0, "NG" },
+ { 0, 0, "J" },
+ { 0, 0, "C" },
+ { 0, 0, "K" },
+ { 0, 0, "T" },
+ { 0, 0, "P" },
+ { 0, 0, "H" }
+};
+
+/* Return 1 if code lies in one of the CJK unified ideograph ranges whose
+   names are generated algorithmically ("CJK UNIFIED IDEOGRAPH-XXXX"),
+   0 otherwise. */
+static int
+is_unified_ideograph(Py_UCS4 code)
+{
+    if (0x3400 <= code && code <= 0x4DB5)       /* CJK Ideograph Extension A */
+        return 1;
+    if (0x4E00 <= code && code <= 0x9FCB)       /* CJK Ideograph, Unicode 5.2 */
+        return 1;
+    if (0x20000 <= code && code <= 0x2A6D6)     /* CJK Ideograph Extension B */
+        return 1;
+    if (0x2A700 <= code && code <= 0x2B734)     /* CJK Ideograph Extension C */
+        return 1;
+    return 0;
+}
+
+/* Write the NUL-terminated name of code into buffer (capacity buflen).
+   Returns 1 on success and 0 when the character has no name, is out of
+   range, is unassigned in the old version selected by self, or the name
+   does not fit. Hangul syllable and CJK unified ideograph names are
+   generated; all other names are expanded from the phrasebook/lexicon
+   tables. */
+static int
+_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
+{
+    int offset;
+    int i;
+    int word;
+    unsigned char* w;
+
+    if (code >= 0x110000)
+        return 0;
+
+    if (self) {
+        const change_record *old = get_old_record(self, code);
+        if (old->category_changed == 0) {
+            /* unassigned */
+            return 0;
+        }
+    }
+
+    if (SBase <= code && code < SBase+SCount) {
+        /* Hangul syllable. */
+        int SIndex = code - SBase;
+        int L = SIndex / NCount;
+        int V = (SIndex % NCount) / TCount;
+        int T = SIndex % TCount;
+
+        if (buflen < 27)
+            /* Worst case: HANGUL SYLLABLE <10chars>. */
+            return 0;
+        strcpy(buffer, "HANGUL SYLLABLE ");
+        buffer += 16;
+        strcpy(buffer, hangul_syllables[L][0]);
+        buffer += strlen(hangul_syllables[L][0]);
+        strcpy(buffer, hangul_syllables[V][1]);
+        buffer += strlen(hangul_syllables[V][1]);
+        strcpy(buffer, hangul_syllables[T][2]);
+        buffer += strlen(hangul_syllables[T][2]);
+        *buffer = '\0';
+        return 1;
+    }
+
+    if (is_unified_ideograph(code)) {
+        if (buflen < 28)
+            /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
+            return 0;
+        sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
+        return 1;
+    }
+
+    /* get offset into phrasebook */
+    offset = phrasebook_offset1[(code>>phrasebook_shift)];
+    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
+                                (code&((1<<phrasebook_shift)-1))];
+    if (!offset)
+        return 0;
+
+    i = 0;
+
+    for (;;) {
+        /* get word index */
+        word = phrasebook[offset] - phrasebook_short;
+        if (word >= 0) {
+            word = (word << 8) + phrasebook[offset+1];
+            offset += 2;
+        } else
+            word = phrasebook[offset++];
+        if (i) {
+            /* buffer[i] is about to be written, so i must be a valid
+               index: i == buflen is already one past the end. */
+            if (i >= buflen)
+                return 0; /* buffer overflow */
+            buffer[i++] = ' ';
+        }
+        /* copy word string from lexicon. the last character in the
+           word has bit 7 set. the last word in a string ends with
+           0x80 */
+        w = lexicon + lexicon_offset[word];
+        while (*w < 128) {
+            if (i >= buflen)
+                return 0; /* buffer overflow */
+            buffer[i++] = *w++;
+        }
+        if (i >= buflen)
+            return 0; /* buffer overflow */
+        /* For the closing 0x80 marker this stores the terminating NUL. */
+        buffer[i++] = *w & 127;
+        if (*w == 128)
+            break; /* end of word */
+    }
+
+    return 1;
+}
+
+/* Return 1 if the name of code point code equals the first namelen bytes
+   of name, compared case-insensitively (name is upper-cased, stored names
+   are taken as-is); 0 otherwise, including when code has no name. */
+static int
+_cmpname(PyObject *self, int code, const char* name, int namelen)
+{
+    /* check if code corresponds to the given name */
+    int i;
+    char buffer[NAME_MAXLEN];
+
+    /* A stored name always fits in buffer together with its NUL, so a
+       candidate this long (or a negative length) can never match.
+       Rejecting it up front also keeps buffer[namelen] below inside the
+       array. */
+    if (namelen < 0 || (size_t)namelen >= sizeof(buffer))
+        return 0;
+    if (!_getucname(self, code, buffer, sizeof(buffer)))
+        return 0;
+    for (i = 0; i < namelen; i++) {
+        /* The candidate is longer than the stored name. Bytes past the
+           terminator are uninitialized, so a candidate with an embedded
+           NUL must not be compared against them. */
+        if (buffer[i] == '\0')
+            return 0;
+        if (toupper(Py_CHARMASK(name[i])) != buffer[i])
+            return 0;
+    }
+    return buffer[namelen] == '\0';
+}
+
+/* Find the longest jamo short name in the given column of hangul_syllables
+   (rows 0 .. count-1) that is a prefix of str. *len receives the length of
+   that match (0 when only an empty name, or nothing, matched) and *pos its
+   row; *pos is left untouched when no row matches. Among equally long
+   matches the first row wins. */
+static void
+find_syllable(const char *str, int *len, int *pos, int count, int column)
+{
+    int row;
+    int best = -1;
+
+    for (row = 0; row < count; row++) {
+        char *jamo = hangul_syllables[row][column];
+        int jamo_len = strlen(jamo);
+
+        if (jamo_len > best && strncmp(str, jamo, jamo_len) == 0) {
+            best = jamo_len;
+            *pos = row;
+        }
+    }
+    *len = (best == -1) ? 0 : best;
+}
+
+/* Look up a character by name. name holds namelen bytes; on success the
+   code point is stored in *code and 1 is returned, otherwise 0.
+   "HANGUL SYLLABLE ..." and "CJK UNIFIED IDEOGRAPH-..." names are parsed
+   directly (and matched case-sensitively by strncmp); every other name is
+   found by probing the code_hash table and confirming each candidate with
+   _cmpname(). */
+static int
+_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
+{
+ unsigned int h, v;
+ unsigned int mask = code_size-1;
+ unsigned int i, incr;
+
+ /* Check for hangul syllables. */
+ if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
+ int len, L = -1, V = -1, T = -1;
+ const char *pos = name + 16;
+ /* Consume the leading consonant, vowel and trailing consonant in
+ turn; each index stays -1 if its part did not match anything. */
+ find_syllable(pos, &len, &L, LCount, 0);
+ pos += len;
+ find_syllable(pos, &len, &V, VCount, 1);
+ pos += len;
+ find_syllable(pos, &len, &T, TCount, 2);
+ pos += len;
+ /* All three parts must be present and must use up the whole name. */
+ if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
+ *code = SBase + (L*VCount+V)*TCount + T;
+ return 1;
+ }
+ /* Otherwise, it's an illegal syllable name. */
+ return 0;
+ }
+
+ /* Check for unified ideographs. */
+ if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
+ /* Four or five hexdigits must follow. */
+ v = 0;
+ name += 22;
+ namelen -= 22;
+ if (namelen != 4 && namelen != 5)
+ return 0;
+ /* Only digits and upper-case A-F are accepted. */
+ while (namelen--) {
+ v *= 16;
+ if (*name >= '0' && *name <= '9')
+ v += *name - '0';
+ else if (*name >= 'A' && *name <= 'F')
+ v += *name - 'A' + 10;
+ else
+ return 0;
+ name++;
+ }
+ if (!is_unified_ideograph(v))
+ return 0;
+ *code = v;
+ return 1;
+ }
+
+ /* the following is the same as python's dictionary lookup, with
+ only minor changes. see the makeunicodedata script for more
+ details */
+
+ h = (unsigned int) _gethash(name, namelen, code_magic);
+ i = (~h) & mask;
+ v = code_hash[i];
+ /* A zero slot is treated as empty: the name is not in the table. */
+ if (!v)
+ return 0;
+ if (_cmpname(self, v, name, namelen)) {
+ *code = v;
+ return 1;
+ }
+ /* Collision: keep probing with an increment that is doubled each step
+ and reduced with code_poly once it exceeds the mask. */
+ incr = (h ^ (h >> 3)) & mask;
+ if (!incr)
+ incr = mask;
+ for (;;) {
+ i = (i + incr) & mask;
+ v = code_hash[i];
+ if (!v)
+ return 0;
+ if (_cmpname(self, v, name, namelen)) {
+ *code = v;
+ return 1;
+ }
+ incr = incr << 1;
+ if (incr > mask)
+ incr = incr ^ code_poly;
+ }
+}
+
+/* C-level name API, published by initunicodedata() as the "ucnhash_CAPI"
+   capsule: the structure size followed by the name and code lookup
+   functions. */
+static const _PyUnicode_Name_CAPI hashAPI =
+{
+ sizeof(_PyUnicode_Name_CAPI),
+ _getucname,
+ _getcode
+};
+
+/* -------------------------------------------------------------------- */
+/* Python bindings */
+
+PyDoc_STRVAR(unicodedata_name__doc__,
+"name(unichr[, default])\n\
+Returns the name assigned to the Unicode character unichr as a\n\
+string. If no name is defined, default is returned, or, if not\n\
+given, ValueError is raised.");
+
+/* Python binding for name(): resolves the character with _getucname() and
+   falls back to the optional default when it has no name. */
+static PyObject *
+unicodedata_name(PyObject* self, PyObject* args)
+{
+    char name[NAME_MAXLEN];
+    Py_UCS4 c;
+    PyUnicodeObject* v;
+    PyObject* defobj = NULL;
+
+    if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
+        return NULL;
+
+    c = getuchar(v);
+    if (c == (Py_UCS4)-1)
+        return NULL;
+
+    if (_getucname(self, c, name, sizeof(name)))
+        return Py_BuildValue("s", name);
+
+    /* No name: hand back the caller's default, or complain. */
+    if (defobj != NULL) {
+        Py_INCREF(defobj);
+        return defobj;
+    }
+    PyErr_SetString(PyExc_ValueError, "no such name");
+    return NULL;
+}
+
+PyDoc_STRVAR(unicodedata_lookup__doc__,
+"lookup(name)\n\
+\n\
+Look up character by name. If a character with the\n\
+given name is found, return the corresponding Unicode\n\
+character. If not found, KeyError is raised.");
+
+/* Python binding for lookup(). On builds without Py_UNICODE_WIDE a code
+   point of 0x10000 or above is returned as a two-unit surrogate pair. */
+static PyObject *
+unicodedata_lookup(PyObject* self, PyObject* args)
+{
+    Py_UCS4 code;
+    Py_UNICODE str[2];
+    Py_ssize_t length = 1;
+    char* name;
+    int namelen;
+
+    if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
+        return NULL;
+
+    if (!_getcode(self, name, namelen, &code)) {
+        PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
+                     name);
+        return NULL;
+    }
+
+    str[0] = (Py_UNICODE) code;
+#ifndef Py_UNICODE_WIDE
+    if (code >= 0x10000) {
+        /* Split into high and low surrogate. */
+        str[0] = 0xd800 + ((code - 0x10000) >> 10);
+        str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
+        length = 2;
+    }
+#endif
+    return PyUnicode_FromUnicode(str, length);
+}
+
+/* XXX Add doc strings. */
+
+/* Method table used twice: as the module-level functions (passed to
+   Py_InitModule3 in initunicodedata) and as the methods of UCD_Type
+   objects (tp_methods). */
+static PyMethodDef unicodedata_functions[] = {
+ {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
+ {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
+ {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
+ {"category", unicodedata_category, METH_VARARGS,
+ unicodedata_category__doc__},
+ {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
+ unicodedata_bidirectional__doc__},
+ {"combining", unicodedata_combining, METH_VARARGS,
+ unicodedata_combining__doc__},
+ {"mirrored", unicodedata_mirrored, METH_VARARGS,
+ unicodedata_mirrored__doc__},
+ {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
+ unicodedata_east_asian_width__doc__},
+ {"decomposition", unicodedata_decomposition, METH_VARARGS,
+ unicodedata_decomposition__doc__},
+ {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
+ {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
+ {"normalize", unicodedata_normalize, METH_VARARGS,
+ unicodedata_normalize__doc__},
+ {NULL, NULL} /* sentinel */
+};
+
+/* Type object for unicodedata.UCD instances (PreviousDBVersion). tp_new is
+   left 0 here; this file creates instances only through
+   new_previous_version(). */
+static PyTypeObject UCD_Type = {
+ /* The ob_type field must be initialized in the module init function
+ * to be portable to Windows without using C++. */
+ PyVarObject_HEAD_INIT(NULL, 0)
+ "unicodedata.UCD", /*tp_name*/
+ sizeof(PreviousDBVersion), /*tp_basicsize*/
+ 0, /*tp_itemsize*/
+ /* methods */
+ (destructor)PyObject_Del, /*tp_dealloc*/
+ 0, /*tp_print*/
+ 0, /*tp_getattr*/
+ 0, /*tp_setattr*/
+ 0, /*tp_compare*/
+ 0, /*tp_repr*/
+ 0, /*tp_as_number*/
+ 0, /*tp_as_sequence*/
+ 0, /*tp_as_mapping*/
+ 0, /*tp_hash*/
+ 0, /*tp_call*/
+ 0, /*tp_str*/
+ PyObject_GenericGetAttr,/*tp_getattro*/
+ 0, /*tp_setattro*/
+ 0, /*tp_as_buffer*/
+ Py_TPFLAGS_DEFAULT, /*tp_flags*/
+ 0, /*tp_doc*/
+ 0, /*tp_traverse*/
+ 0, /*tp_clear*/
+ 0, /*tp_richcompare*/
+ 0, /*tp_weaklistoffset*/
+ 0, /*tp_iter*/
+ 0, /*tp_iternext*/
+ unicodedata_functions, /*tp_methods*/
+ DB_members, /*tp_members*/
+ 0, /*tp_getset*/
+ 0, /*tp_base*/
+ 0, /*tp_dict*/
+ 0, /*tp_descr_get*/
+ 0, /*tp_descr_set*/
+ 0, /*tp_dictoffset*/
+ 0, /*tp_init*/
+ 0, /*tp_alloc*/
+ 0, /*tp_new*/
+ 0, /*tp_free*/
+ 0, /*tp_is_gc*/
+};
+
+PyDoc_STRVAR(unicodedata_docstring,
+"This module provides access to the Unicode Character Database which\n\
+defines character properties for all Unicode characters. The data in\n\
+this database is based on the UnicodeData.txt file version\n\
+5.2.0 which is publically available from ftp://ftp.unicode.org/.\n\
+\n\
+The module uses the same names and symbols as defined by the\n\
+UnicodeData File Format 5.2.0 (see\n\
+http://www.unicode.org/reports/tr44/tr44-4.html).");
+
+/* Module initialization: creates the "unicodedata" module and adds the
+   unidata_version string, the UCD type, the ucd_3_2_0 old-version object
+   and the ucnhash_CAPI capsule. Return values of the PyModule_Add* calls
+   are not checked. */
+PyMODINIT_FUNC
+initunicodedata(void)
+{
+ PyObject *m, *v;
+
+ /* Set at run time; see the comment on UCD_Type. */
+ Py_TYPE(&UCD_Type) = &PyType_Type;
+
+ m = Py_InitModule3(
+ "unicodedata", unicodedata_functions, unicodedata_docstring);
+ if (!m)
+ return;
+
+ PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
+ Py_INCREF(&UCD_Type);
+ PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
+
+ /* Previous versions */
+ v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
+ if (v != NULL)
+ PyModule_AddObject(m, "ucd_3_2_0", v);
+
+ /* Export C API */
+ v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
+ if (v != NULL)
+ PyModule_AddObject(m, "ucnhash_CAPI", v);
+}
+
+/*
+Local variables:
+c-basic-offset: 4
+indent-tabs-mode: nil
+End:
+*/