ucommon
unicode.h
Go to the documentation of this file.
1 // Copyright (C) 2009-2014 David Sugar, Tycho Softworks.
2 //
3 // This file is part of GNU uCommon C++.
4 //
5 // GNU uCommon C++ is free software: you can redistribute it and/or modify
6 // it under the terms of the GNU Lesser General Public License as published
7 // by the Free Software Foundation, either version 3 of the License, or
8 // (at your option) any later version.
9 //
10 // GNU uCommon C++ is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 // GNU Lesser General Public License for more details.
14 //
15 // You should have received a copy of the GNU Lesser General Public License
16 // along with GNU uCommon C++. If not, see <http://www.gnu.org/licenses/>.
17 
32 #ifndef _UCOMMON_UNICODE_H_
33 #define _UCOMMON_UNICODE_H_
34 
35 #ifndef _UCOMMON_STRING_H_
36 #include <ucommon/string.h>
37 #endif
38 
39 NAMESPACE_UCOMMON
40 
/**
 * 32 bit unicode character code.
 */
typedef int32_t ucs4_t;

/**
 * 16 bit unicode character code.
 */
typedef int16_t ucs2_t;

/**
 * Untyped pointer used to pass unicode character data around.
 * Resolves issues where wchar_t is not defined.
 */
typedef void *unicode_t;
56 
62 class __EXPORT utf8
63 {
64 public:
68  static const unsigned ucsize;
69 
73  static const char *nil;
74 
80  static unsigned size(const char *codepoint);
81 
87  static size_t count(const char *string);
88 
95  static char *offset(char *string, ssize_t position);
96 
102  static ucs4_t codepoint(const char *encoded);
103 
109  static size_t chars(const unicode_t string);
110 
116  static size_t chars(ucs4_t character);
117 
124  static size_t unpack(const unicode_t string, CharacterProtocol& buffer);
125 
133  static size_t pack(unicode_t unicode, CharacterProtocol& buffer, size_t size);
134 
138  static ucs4_t *udup(const char *string);
139 
143  static ucs2_t *wdup(const char *string);
144 
152  static const char *find(const char *string, ucs4_t character, size_t start = 0);
153 
161  static const char *rfind(const char *string, ucs4_t character, size_t end = (size_t)-1l);
162 
169  static unsigned ccount(const char *string, ucs4_t character);
170 
176  static ucs4_t get(CharacterProtocol& buffer);
177 
184  static ucs4_t put(ucs4_t character, CharacterProtocol& buffer);
185 };
186 
193 class __EXPORT UString : public String, public utf8
194 {
195 protected:
199  UString();
200 
205  UString(strsize_t size);
206 
211  UString(const unicode_t text);
212 
219  UString(const char *text, strsize_t size);
220 
227  UString(const unicode_t *text, const unicode_t *end);
228 
234  UString(const UString& existing);
235 
240  virtual ~UString();
241 
248  UString get(strsize_t codepoint, strsize_t size = 0) const;
249 
256  size_t get(unicode_t unicode, size_t size) const;
257 
262  void set(const unicode_t unicode);
263 
268  void add(const unicode_t unicode);
269 
275  ucs4_t at(int position) const;
276 
283  inline size_t operator()(unicode_t unicode, size_t size) const
284  {return get(unicode, size);};
285 
292  UString operator()(int codepoint, strsize_t size) const;
293 
299  inline UString left(strsize_t size) const
300  {return operator()(0, size);}
301 
307  inline UString right(strsize_t offset) const
308  {return operator()(-((int)offset), 0);}
309 
316  inline UString copy(strsize_t offset, strsize_t size) const
317  {return operator()((int)offset, size);}
318 
324  void cut(strsize_t offset, strsize_t size = 0);
325 
332  void paste(strsize_t offset, const char *text, strsize_t size = 0);
333 
341  const char *operator()(int offset) const;
342 
348  inline ucs4_t operator[](int position) const
349  {return UString::at(position);};
350 
355  inline strsize_t count(void) const
356  {return utf8::count(str->text);}
357 
363  unsigned ccount(ucs4_t character) const;
364 
371  const char *find(ucs4_t character, strsize_t start = 0) const;
372 
379  const char *rfind(ucs4_t character, strsize_t end = npos) const;
380 };
381 
387 class __EXPORT utf8_pointer
388 {
389 protected:
390  uint8_t *text;
391 
392 public:
396  utf8_pointer();
397 
402  utf8_pointer(const char *string);
403 
409 
414  utf8_pointer& operator ++();
415 
420  utf8_pointer& operator --();
421 
427  utf8_pointer& operator +=(long offset);
428 
434  utf8_pointer& operator -=(long offset);
435 
441  utf8_pointer operator+(long offset) const;
442 
448  utf8_pointer operator-(long offset) const;
449 
454  inline operator bool() const
455  {return text != NULL;};
456 
461  inline bool operator!() const
462  {return text == NULL;};
463 
469  ucs4_t operator[](long codepoint) const;
470 
476  utf8_pointer& operator=(const char *string);
477 
481  void inc(void);
482 
486  void dec(void);
487 
493  inline bool operator==(const char *string) const
494  {return (const char *)text == string;};
495 
501  inline bool operator!=(const char *string) const
502  {return (const char *)text != string;};
503 
508  inline ucs4_t operator*() const
509  {return utf8::codepoint((const char *)text);};
510 
515  inline char *c_str(void) const
516  {return (char *)text;};
517 
522  inline operator char*() const
523  {return (char *)text;};
524 
529  inline size_t len(void) const
530  {return utf8::count((const char *)text);};
531 };
532 
533 inline ucs4_t *strudup(const char *string)
534  {return utf8::udup(string);}
535 
536 inline ucs2_t *strwdup(const char *string)
537  {return utf8::wdup(string);}
538 
539 __EXPORT unicode_t unidup(const char *string);
540 
541 template<>
542 inline void dupfree<ucs2_t*>(ucs2_t *string)
543  {::free(string);}
544 
545 template<>
546 inline void dupfree<ucs4_t*>(ucs4_t *string)
547  {::free(string);}
548 
549 template<>
550 inline void dupfree<unicode_t>(unicode_t string)
551  {::free(string);}
552 
557 
562 
563 END_NAMESPACE
564 
565 #endif
bool operator==(const char *string) const
Check if pointer equals another string.
Definition: unicode.h:493
utf8_pointer utf8_t
Convenience type for utf8_pointer strings.
Definition: unicode.h:561
static ucs4_t codepoint(const char *encoded)
Convert a utf8 encoded codepoint to a ucs4 character value.
strsize_t count(void) const
Count codepoints in current string.
Definition: unicode.h:355
size_t len(void) const
Get length of null terminated utf8 string in codepoints.
Definition: unicode.h:529
char at(int position) const
Return character found at a specific position in the string.
A core class of utf8 encoded string functions.
Definition: unicode.h:62
void cut(strsize_t offset, strsize_t size=0)
Cut (remove) text from string.
String operator()(int offset, strsize_t size) const
Get a new substring through object expression.
int32_t ucs4_t
32 bit unicode character code.
Definition: unicode.h:45
A copy-on-write string class that operates by reference count.
Definition: string.h:82
UString left(strsize_t size) const
Convenience method for left of string.
Definition: unicode.h:299
size_t operator()(unicode_t unicode, size_t size) const
Extract a unicode byte sequence from utf8 object.
Definition: unicode.h:283
strsize_t size(void) const
Get the size of currently allocated space for string.
static const char * nil
A convenient NULL pointer value.
Definition: unicode.h:73
UString right(strsize_t offset) const
Convenience method for right of string.
Definition: unicode.h:307
void paste(strsize_t offset, const char *text, strsize_t size=0)
Insert (paste) text into string.
void set(const char *text)
Set string object to text of a null terminated string.
ucs4_t operator*() const
Get unicode character pointed to by pointer.
Definition: unicode.h:508
ucs4_t at(int position) const
Return unicode character found at a specific codepoint in the string.
const char * find(const char *list, strsize_t offset=0) const
Find a character in the string.
UString copy(strsize_t offset, strsize_t size) const
Convenience method for substring extraction.
Definition: unicode.h:316
void start(JoinableThread *thread, int priority=0)
Convenience function to start a joinable thread.
Definition: thread.h:1822
bool operator!() const
Check if text is an invalid pointer.
Definition: unicode.h:461
A common string class and character string support functions.
void * unicode_t
Resolves issues where wchar_t is not defined.
Definition: unicode.h:55
A copy-on-write utf8 string class that operates by reference count.
Definition: unicode.h:193
const char * rfind(const char *list, strsize_t offset=npos) const
Find last occurrence of character in the string.
strsize_t ccount(const char *list) const
Count number of occurrences of characters in string.
Common character processing protocol.
Definition: protocols.h:174
void add(const char *text)
Append null terminated text to our string buffer.
unsigned short strsize_t
A convenience class for size of strings.
Definition: string.h:70
ObjectProtocol * copy(ObjectProtocol *object)
Convenience function to access object copy.
Definition: object.h:479
static const unsigned ucsize
Size of "unicode_t" character codes, may not be ucs4_t size.
Definition: unicode.h:68
bool operator!=(const char *string) const
Check if pointer does not equal another string.
Definition: unicode.h:501
ucs4_t operator[](int position) const
Reference a unicode character in string object by array offset.
Definition: unicode.h:348
Pointer to utf8 encoded character data.
Definition: unicode.h:387
UString ustring_t
Convenience type for utf8 encoded strings.
Definition: unicode.h:556
static size_t count(const char *string)
Count utf8 encoded ucs4 codepoints in string.
char * c_str(void) const
Get c string we point to.
Definition: unicode.h:515
int16_t ucs2_t
16 bit unicode character code.
Definition: unicode.h:50