// FONTAINE 1.0
00001 // 00002 // The Fontaine Font Analysis Project 00003 // 00004 // Copyright (c) 2009 by Edward H. Trager 00005 // All Rights Reserved 00006 // 00007 // Released under the GNU GPL version 2.0 or later. 00008 // 00009 00010 00012 // 00013 // This file was originally part of the MADELINE 2 program 00014 // written by Edward H. Trager and Ritu Khanna 00015 // Copyright (c) 2005 by the 00016 // Regents of the University of Michigan. 00017 // All Rights Reserved. 00018 // Released under the GNU General Public License v. 2.0 or later. 00019 // 00021 // 00022 // utf8String.h 00023 // 00024 // (c) 2006 by Edward H. Trager 00025 // released under the GNU General Public License 00026 // 00027 // This file was originally written for inclusion 00028 // in "Font Playground" . 00029 // 00030 // 2006.04.30.et. 00031 // LAST UPDATE: 2007.01.08 00032 // 00033 00034 #ifndef UTF8STRING_INCLUDED 00035 #define UTF8STRING_INCLUDED 00036 00037 #include "ScriptCodes.h" 00038 #include <string> 00039 00040 typedef unsigned long UTF32; // at least 32 bits 00041 typedef unsigned short UTF16; // at least 16 bits 00042 typedef unsigned char UTF8; 00043 00044 #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD 00045 #define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF 00046 00047 // 00048 // The following are needed for UTF-16 conversion: 00049 // 00050 #define UNI_SUR_HIGH_START (UTF32)0xD800 00051 #define UNI_SUR_HIGH_END (UTF32)0xDBFF 00052 #define UNI_SUR_LOW_START (UTF32)0xDC00 00053 #define UNI_SUR_LOW_END (UTF32)0xDFFF 00054 00055 class UTF8String : public std::string { 00056 00057 00058 private: 00059 00060 const char *_UTF32ValueToUTF8( UTF32 UTF32Value ); 00061 00062 public: 00063 00064 // Default constructor just calls base class std::String(): 00065 UTF8String(); 00066 // Copy Constructors: 00067 UTF8String(const std::string &s); 00068 UTF8String(const UTF8String &s); 00069 // How many Unicode values are stored in the string?: 00070 unsigned int unicodeValueCount() const; 00071 // Get the Unicode 
substring starting at the "stt" unicode value -- 00072 // Note that stt=1 (*not* zero) returns the entire string: 00073 UTF8String unicodeSubString(unsigned int stt,unsigned int howManyCharacters=0) const; 00074 // Read-only bracket operator retrieves the nth unicode character -- 00075 // Note that pos=1 (*not* zero) specifies the first character: 00076 UTF8String operator[](unsigned int pos) const; 00077 // Return the Unicode code value of the nth Unicode character: 00078 UTF32 unicodeValueAtPosition(unsigned int pos=0) const; 00079 00080 // 00081 // Return a substring less than or equal to the howManyCharacters in 00082 // length where the end of the string is on a word boundary. 00083 // 00084 UTF8String unicodeSubStringOnWordBoundary(unsigned int stt,unsigned int howManyCharacters) const; 00085 00086 // Returns boolean TRUE if the string begins with a character 00087 // from a right-to-left script: 00088 bool isRTL(void) const; 00089 // Returns a boolean TRUE if the string begins with a character 00090 // from an Indic or Indic-derived script. Such scripts have 00091 // special complex text layout requirements: 00092 bool isIndic(void) const; 00093 00094 // Returns a boolean TRUE if the string begins with a character 00095 // from the Arabic script. 
This script has 00096 // special complex text layout requirements: 00097 bool isArabic(void) const; 00098 00099 // 00100 // Returns a script code based on the Unicode range of the first 00101 // character in the string: Currently only handles the Arabic and 00102 // Indic cases relevant for complex text layout 00103 // 00104 SCRIPTCODE getScriptCode(void); 00105 00106 // Returns a UTF32 String: 00107 std::basic_string<UTF32> UTF32String() const; 00108 00109 // 00110 // Append and Derived Overloaded Assignment operators: 00111 // 00112 UTF8String& append( const std::basic_string<UTF32> &UTF32String ); 00113 UTF8String& append( const std::basic_string<UTF16> &UTF16String ); 00114 00115 UTF8String& operator+=( const std::basic_string<UTF32> &UTF32String ); 00116 UTF8String& operator+=( const std::basic_string<UTF16> &UTF16String ); 00117 00118 UTF8String& operator=( const std::basic_string<UTF32> &UTF32String ); 00119 UTF8String& operator=( const std::basic_string<UTF16> &UTF16String ); 00120 00121 // 00122 // Specialized constructors: 00123 // 00124 // Construct a UTF8String from a UTF32 or UTF16 string: 00125 // 00126 // These also ultimately use the append() methods from above: 00127 // 00128 UTF8String( const std::basic_string<UTF32> &UTF32String ); 00129 UTF8String( const std::basic_string<UTF16> &UTF16String ); 00130 00131 }; 00132 00133 #endif 00134