Botan  1.10.9
charset.cpp
Go to the documentation of this file.
1 /*
2 * Character Set Handling
3 * (C) 1999-2007 Jack Lloyd
4 *
5 * Distributed under the terms of the Botan license
6 */
7 
8 #include <botan/charset.h>
9 #include <botan/parsing.h>
10 #include <botan/exceptn.h>
11 #include <cctype>
12 
13 namespace Botan {
14 
15 namespace Charset {
16 
17 namespace {
18 
19 /*
20 * Convert from UCS-2 to ISO 8859-1
21 */
22 std::string ucs2_to_latin1(const std::string& ucs2)
23  {
24  if(ucs2.size() % 2 == 1)
25  throw Decoding_Error("UCS-2 string has an odd number of bytes");
26 
27  std::string latin1;
28 
29  for(size_t i = 0; i != ucs2.size(); i += 2)
30  {
31  const byte c1 = ucs2[i];
32  const byte c2 = ucs2[i+1];
33 
34  if(c1 != 0)
35  throw Decoding_Error("UCS-2 has non-Latin1 characters");
36 
37  latin1 += static_cast<char>(c2);
38  }
39 
40  return latin1;
41  }
42 
43 /*
44 * Convert from UTF-8 to ISO 8859-1
45 */
46 std::string utf8_to_latin1(const std::string& utf8)
47  {
48  std::string iso8859;
49 
50  size_t position = 0;
51  while(position != utf8.size())
52  {
53  const byte c1 = static_cast<byte>(utf8[position++]);
54 
55  if(c1 <= 0x7F)
56  iso8859 += static_cast<char>(c1);
57  else if(c1 >= 0xC0 && c1 <= 0xC7)
58  {
59  if(position == utf8.size())
60  throw Decoding_Error("UTF-8: sequence truncated");
61 
62  const byte c2 = static_cast<byte>(utf8[position++]);
63  const byte iso_char = ((c1 & 0x07) << 6) | (c2 & 0x3F);
64 
65  if(iso_char <= 0x7F)
66  throw Decoding_Error("UTF-8: sequence longer than needed");
67 
68  iso8859 += static_cast<char>(iso_char);
69  }
70  else
71  throw Decoding_Error("UTF-8: Unicode chars not in Latin1 used");
72  }
73 
74  return iso8859;
75  }
76 
77 /*
78 * Convert from ISO 8859-1 to UTF-8
79 */
80 std::string latin1_to_utf8(const std::string& iso8859)
81  {
82  std::string utf8;
83  for(size_t i = 0; i != iso8859.size(); ++i)
84  {
85  const byte c = static_cast<byte>(iso8859[i]);
86 
87  if(c <= 0x7F)
88  utf8 += static_cast<char>(c);
89  else
90  {
91  utf8 += static_cast<char>((0xC0 | (c >> 6)));
92  utf8 += static_cast<char>((0x80 | (c & 0x3F)));
93  }
94  }
95  return utf8;
96  }
97 
98 }
99 
100 /*
101 * Perform character set transcoding
102 */
103 std::string transcode(const std::string& str,
104  Character_Set to, Character_Set from)
105  {
106  if(to == LOCAL_CHARSET)
107  to = LATIN1_CHARSET;
108  if(from == LOCAL_CHARSET)
109  from = LATIN1_CHARSET;
110 
111  if(to == from)
112  return str;
113 
114  if(from == LATIN1_CHARSET && to == UTF8_CHARSET)
115  return latin1_to_utf8(str);
116  if(from == UTF8_CHARSET && to == LATIN1_CHARSET)
117  return utf8_to_latin1(str);
118  if(from == UCS2_CHARSET && to == LATIN1_CHARSET)
119  return ucs2_to_latin1(str);
120 
121  throw Invalid_Argument("Unknown transcoding operation from " +
122  to_string(from) + " to " + to_string(to));
123  }
124 
/*
* Check if a character represents a digit
*
* Returns true only for the ten characters '0' through '9'. The test
* does not go through <cctype> and so does not depend on the locale.
*/
bool is_digit(char c)
   {
   switch(c)
      {
      case '0': case '1': case '2': case '3': case '4':
      case '5': case '6': case '7': case '8': case '9':
         return true;
      default:
         return false;
      }
   }
135 
/*
* Check if a character represents whitespace
*
* Only space, horizontal tab, line feed and carriage return count as
* whitespace here; vertical tab and form feed do not.
*/
bool is_space(char c)
   {
   switch(c)
      {
      case ' ':
      case '\t':
      case '\n':
      case '\r':
         return true;
      default:
         return false;
      }
   }
145 
146 /*
147 * Convert a character to a digit
148 */
150  {
151  switch(c)
152  {
153  case '0': return 0;
154  case '1': return 1;
155  case '2': return 2;
156  case '3': return 3;
157  case '4': return 4;
158  case '5': return 5;
159  case '6': return 6;
160  case '7': return 7;
161  case '8': return 8;
162  case '9': return 9;
163  }
164 
165  throw Invalid_Argument("char2digit: Input is not a digit character");
166  }
167 
168 /*
169 * Convert a digit to a character
170 */
172  {
173  switch(b)
174  {
175  case 0: return '0';
176  case 1: return '1';
177  case 2: return '2';
178  case 3: return '3';
179  case 4: return '4';
180  case 5: return '5';
181  case 6: return '6';
182  case 7: return '7';
183  case 8: return '8';
184  case 9: return '9';
185  }
186 
187  throw Invalid_Argument("digit2char: Input is not a digit");
188  }
189 
/*
* Case-insensitive character comparison
*
* Both characters are lower-cased with std::tolower and the results
* compared. Each argument is converted to unsigned char first, since
* passing a negative char value to std::tolower is undefined.
*/
bool caseless_cmp(char a, char b)
   {
   const int folded_a = std::tolower(static_cast<unsigned char>(a));
   const int folded_b = std::tolower(static_cast<unsigned char>(b));

   return folded_a == folded_b;
   }
198 
199 }
200 
201 }
bool caseless_cmp(char a, char b)
Definition: charset.cpp:193
std::invalid_argument Invalid_Argument
Definition: exceptn.h:20
Character_Set
Definition: charset.h:19
unsigned char byte
Definition: types.h:22
std::string transcode(const std::string &str, Character_Set to, Character_Set from)
Definition: charset.cpp:103
byte char2digit(char c)
Definition: charset.cpp:149
bool is_digit(char c)
Definition: charset.cpp:128
bool is_space(char c)
Definition: charset.cpp:139
std::string to_string(u64bit n, size_t min_len)
Definition: parsing.cpp:42
char digit2char(byte b)
Definition: charset.cpp:171