1 | <?php |
---|
2 | |
---|
3 | /****************************************************************************** |
---|
4 | * |
---|
5 | * Filename: Unicode.php |
---|
6 | * |
---|
7 | * Description: Provides functions for handling Unicode strings in PHP without |
---|
8 | * needing to configure the non-default mbstring extension |
---|
9 | * |
---|
10 | * Author: Evan Hunter |
---|
11 | * |
---|
12 | * Date: 27/7/2004 |
---|
13 | * |
---|
14 | * Project: JPEG Metadata |
---|
15 | * |
---|
16 | * Revision: 1.10 |
---|
17 | * |
---|
18 | * Changes: 1.00 -> 1.10 : Added the following functions: |
---|
19 | * smart_HTML_Entities |
---|
20 | * smart_htmlspecialchars |
---|
21 | * HTML_UTF16_UnEscape |
---|
22 | * HTML_UTF8_UnEscape |
---|
23 | * changed HTML_UTF8_Escape and HTML_UTF16_Escape to |
---|
24 | * use smart_htmlspecialchars, so that characters which |
---|
25 | * were already escaped would remain intact |
---|
26 | * |
---|
27 | * |
---|
28 | * URL: http://electronics.ozhiker.com |
---|
29 | * |
---|
30 | * License: This file is part of the PHP JPEG Metadata Toolkit. |
---|
31 | * |
---|
32 | * The PHP JPEG Metadata Toolkit is free software; you can |
---|
33 | * redistribute it and/or modify it under the terms of the |
---|
34 | * GNU General Public License as published by the Free Software |
---|
35 | * Foundation; either version 2 of the License, or (at your |
---|
36 | * option) any later version. |
---|
37 | * |
---|
38 | * The PHP JPEG Metadata Toolkit is distributed in the hope |
---|
39 | * that it will be useful, but WITHOUT ANY WARRANTY; without |
---|
40 | * even the implied warranty of MERCHANTABILITY or FITNESS |
---|
41 | * FOR A PARTICULAR PURPOSE. See the GNU General Public License |
---|
42 | * for more details. |
---|
43 | * |
---|
44 | * You should have received a copy of the GNU General Public |
---|
45 | * License along with the PHP JPEG Metadata Toolkit; if not, |
---|
46 | * write to the Free Software Foundation, Inc., 59 Temple |
---|
47 | * Place, Suite 330, Boston, MA 02111-1307 USA |
---|
48 | * |
---|
49 | * If you require a different license for commercial or other |
---|
50 | * purposes, please contact the author: evan@ozhiker.com |
---|
51 | * |
---|
52 | ******************************************************************************/ |
---|
53 | |
---|
54 | |
---|
55 | // TODO: UTF-16 functions have not been tested fully |
---|
56 | |
---|
57 | |
---|
58 | |
---|
59 | /****************************************************************************** |
---|
60 | * |
---|
61 | * Unicode UTF-8 Encoding Functions |
---|
62 | * |
---|
63 | * Description: UTF-8 is a Unicode encoding system in which extended characters |
---|
64 | * use only the upper half (128 values) of the byte range, thus it |
---|
65 | * allows the use of normal 7-bit ASCII text. |
---|
66 | * 7-Bit ASCII will pass straight through UTF-8 encoding/decoding without change |
---|
67 | * |
---|
68 | * |
---|
69 | * The encoding is as follows: |
---|
70 | * Unicode Value : Binary representation (x=data bit) |
---|
71 | *-------------------------------------------------------------------------------- |
---|
72 | * U-00000000 - U-0000007F: 0xxxxxxx <- This is 7-bit ASCII |
---|
73 | * U-00000080 - U-000007FF: 110xxxxx 10xxxxxx |
---|
74 | * U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx |
---|
75 | * U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
---|
76 | * U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
---|
77 | * U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
---|
78 | *-------------------------------------------------------------------------------- |
---|
79 | * |
---|
80 | ******************************************************************************/ |
---|
81 | |
---|
82 | |
---|
83 | |
---|
84 | |
---|
85 | /****************************************************************************** |
---|
86 | * |
---|
87 | * Unicode UTF-16 Encoding Functions |
---|
88 | * |
---|
89 | * Description: UTF-16 is a Unicode encoding system which uses 16 bit values for representing |
---|
90 | * characters. |
---|
91 | * It also has an extended set of characters available by the use |
---|
92 | * of surrogate pairs, which are a pair of 16 bit values, giving a |
---|
93 | * total data length of 20 useful bits. |
---|
94 | * |
---|
95 | * |
---|
96 | * The encoding is as follows: |
---|
97 | * Unicode Value : Binary representation (x=data bit) |
---|
98 | *-------------------------------------------------------------------------------- |
---|
99 | * U-000000 - U-00D7FF: xxxxxxxx xxxxxxxx |
---|
100 | * U-00D800 - U-00DBFF: Not available - used for high surrogate pairs |
---|
101 | * U-00DC00 - U-00DFFF: Not available - used for low surrogate pairs |
---|
102 | * U-00E000 - U-00FFFF: xxxxxxxx xxxxxxxx |
---|
103 | * U-010000 - U-10FFFF: 110110ww wwxxxxxx 110111xx xxxxxxxx ( wwww = (uni-0x10000)/0x10000 ) |
---|
104 | *-------------------------------------------------------------------------------- |
---|
105 | * |
---|
106 | * Surrogate pair Calculations |
---|
107 | * |
---|
108 | * $hi = ($uni - 0x10000) / 0x400 + 0xD800; |
---|
109 | * $lo = ($uni - 0x10000) % 0x400 + 0xDC00; |
---|
110 | * |
---|
111 | * |
---|
112 | * $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00); |
---|
113 | * |
---|
114 | * |
---|
115 | ******************************************************************************/ |
---|
116 | |
---|
117 | |
---|
118 | |
---|
119 | |
---|
120 | |
---|
121 | |
---|
122 | /****************************************************************************** |
---|
123 | * |
---|
124 | * Function: UTF8_fix |
---|
125 | * |
---|
126 | * Description: Checks a string for badly formed Unicode UTF-8 coding and |
---|
127 | * returns the same string containing only the parts which |
---|
128 | * were properly formed UTF-8 data. |
---|
129 | * |
---|
130 | * Parameters: utf8_text - a string with possibly badly formed UTF-8 data |
---|
131 | * |
---|
132 | * Returns: output - the well formed UTF-8 version of the string |
---|
133 | * |
---|
134 | ******************************************************************************/ |
---|
135 | |
---|
/**
 * Removes badly formed sequences from a UTF-8 string.
 *
 * The string is scanned one sequence at a time. A sequence is copied to the
 * output only when its lead byte announces a length of 1 to 6 bytes, that many
 * bytes are actually present, and every byte after the lead byte is a
 * continuation byte (binary 10xxxxxx). When any of these checks fails, only
 * the offending lead byte is dropped and scanning resumes at the next byte,
 * so well formed data following a broken sequence is preserved.
 *
 * The 5 and 6 byte forms listed in the table at the top of this file are
 * still accepted. Overlong encodings and encoded surrogates are not rejected.
 *
 * @param string $utf8_text a string with possibly badly formed UTF-8 data
 *
 * @return string the well formed parts of the input, in their original order
 */
function UTF8_fix( $utf8_text )
{
    // The length does not change while scanning, so evaluate it only once
    $length = strlen( $utf8_text );

    // Create a string to accept the well formed output
    $output = "";

    // Initialise the current position in the string
    $pos = 0;

    // Cycle through each group of bytes, ensuring the coding is correct
    while ( $pos < $length )
    {
        // Retrieve the current numerical character value
        // (square brackets are used because the {} offset syntax was removed in PHP 8.0)
        $chval = ord( $utf8_text[$pos] );

        // The lead byte tells us how many bytes the Unicode value covers
        if ( $chval <= 0x7F )
        {
            // 1 Byte UTF-8 Unicode (7-Bit ASCII) Character
            $bytes = 1;
        }
        else if ( ( $chval >= 0xC0 ) && ( $chval <= 0xDF ) )
        {
            // 2 Byte UTF-8 Unicode Character
            $bytes = 2;
        }
        else if ( ( $chval >= 0xE0 ) && ( $chval <= 0xEF ) )
        {
            // 3 Byte UTF-8 Unicode Character
            $bytes = 3;
        }
        else if ( ( $chval >= 0xF0 ) && ( $chval <= 0xF7 ) )
        {
            // 4 Byte UTF-8 Unicode Character
            $bytes = 4;
        }
        else if ( ( $chval >= 0xF8 ) && ( $chval <= 0xFB ) )
        {
            // 5 Byte UTF-8 Unicode Character
            $bytes = 5;
        }
        else if ( ( $chval >= 0xFC ) && ( $chval <= 0xFD ) )
        {
            // 6 Byte UTF-8 Unicode Character
            $bytes = 6;
        }
        else
        {
            // Invalid lead byte: a stray continuation byte (0x80-0xBF) or 0xFE/0xFF
            $bytes = 0;
        }

        // The sequence is usable only if the lead byte was valid
        // and the whole sequence fits inside the string
        $well_formed = ( $bytes > 0 ) && ( ( $pos + $bytes ) <= $length );

        // Every byte after the lead byte must look like 10xxxxxx
        for ( $i = 1; $well_formed && ( $i < $bytes ); $i++ )
        {
            $well_formed = ( ( ord( $utf8_text[$pos + $i] ) & 0xC0 ) === 0x80 );
        }

        if ( $well_formed )
        {
            // Copy the complete sequence to the output and step over it
            $output .= substr( $utf8_text, $pos, $bytes );
            $pos += $bytes;
        }
        else
        {
            // Drop just this byte - the bytes after it are examined on their own,
            // so valid data following a broken sequence is not lost
            $pos++;
        }
    }

    // Return the result
    return $output;
}
---|
212 | |
---|
213 | /****************************************************************************** |
---|
214 | * End of Function: UTF8_fix |
---|
215 | ******************************************************************************/ |
---|
216 | |
---|
217 | |
---|
218 | |
---|
219 | |
---|
220 | |
---|
221 | |
---|
222 | |
---|
223 | |
---|
224 | |
---|
225 | /****************************************************************************** |
---|
226 | * |
---|
227 | * Function: UTF16_fix |
---|
228 | * |
---|
229 | * Description: Checks a string for badly formed Unicode UTF-16 coding and |
---|
230 | * returns the same string containing only the parts which |
---|
231 | * were properly formed UTF-16 data. |
---|
232 | * |
---|
233 | * Parameters: utf16_text - a string with possibly badly formed UTF-16 data |
---|
234 | * MSB_first - True will cause processing as Big Endian UTF-16 (Motorola, MSB first) |
---|
235 | * False will cause processing as Little Endian UTF-16 (Intel, LSB first) |
---|
236 | * |
---|
237 | * Returns: output - the well formed UTF-16 version of the string |
---|
238 | * |
---|
239 | ******************************************************************************/ |
---|
240 | |
---|
/**
 * Removes badly formed code units from a UTF-16 string.
 *
 * The string is read two bytes (one 16 bit code unit) at a time:
 *  - a unit outside 0xD800-0xDFFF is copied unchanged;
 *  - a high surrogate (0xD800-0xDBFF) is copied together with the unit that
 *    follows it only when that unit is a low surrogate (0xDC00-0xDFFF);
 *    otherwise the high surrogate alone is dropped and the following unit
 *    is examined again in its own right;
 *  - a low surrogate which does not follow a high surrogate is dropped;
 *  - a single trailing byte which cannot form a code unit is dropped.
 *
 * @param string $utf16_text a string with possibly badly formed UTF-16 data
 * @param bool   $MSB_first  True will cause processing as Big Endian UTF-16 (Motorola, MSB first)
 *                           False will cause processing as Little Endian UTF-16 (Intel, LSB first)
 *
 * @return string the well formed UTF-16 version of the string, in the same byte order
 */
function UTF16_fix( $utf16_text, $MSB_first )
{
    // The length does not change while scanning, so evaluate it only once
    $length = strlen( $utf16_text );

    // Create a string to accept the well formed output
    $output = "";

    // Initialise the current position in the string
    $pos = 0;

    // Cycle through each complete 16 bit code unit.
    // A trailing odd byte never satisfies this condition and is therefore dropped
    while ( ( $pos + 1 ) < $length )
    {
        // Retrieve the two bytes of the current code unit
        // (square brackets are used because the {} offset syntax was removed in PHP 8.0)
        $chval1 = ord( $utf16_text[$pos] );
        $chval2 = ord( $utf16_text[$pos + 1] );
        $unit = substr( $utf16_text, $pos, 2 );

        // Skip over the code unit just read
        $pos += 2;

        // Calculate the 16 bit unicode value
        if ( $MSB_first )
        {
            // Big Endian
            $UTF16_val = ( $chval1 << 8 ) | $chval2;
        }
        else
        {
            // Little Endian
            $UTF16_val = ( $chval2 << 8 ) | $chval1;
        }

        if ( ( $UTF16_val <= 0xD7FF ) || ( $UTF16_val >= 0xE000 ) )
        {
            // Normal Character (Non Surrogate pair)
            // Add it to the output
            $output .= $unit;
        }
        else if ( ( $UTF16_val >= 0xD800 ) && ( $UTF16_val <= 0xDBFF ) )
        {
            // High surrogate of a surrogate pair
            // Now we need to read the low surrogate, which needs exactly two
            // more bytes. (The previous test of "$pos + 3" demanded four more
            // bytes, so a valid pair at the very end of the string was lost.)
            if ( ( $pos + 1 ) < $length )
            {
                $chval3 = ord( $utf16_text[$pos] );
                $chval4 = ord( $utf16_text[$pos + 1] );

                // Calculate the second 16 bit unicode value
                if ( $MSB_first )
                {
                    // Big Endian
                    $UTF16_val2 = ( $chval3 << 8 ) | $chval4;
                }
                else
                {
                    // Little Endian
                    $UTF16_val2 = ( $chval4 << 8 ) | $chval3;
                }

                // Check that this is a low surrogate
                if ( ( $UTF16_val2 >= 0xDC00 ) && ( $UTF16_val2 <= 0xDFFF ) )
                {
                    // Low surrogate found following high surrogate
                    // Add both to the output
                    $output .= $unit . substr( $utf16_text, $pos, 2 );

                    // Skip over the low surrogate
                    $pos += 2;
                }

                // Otherwise no low surrogate followed the high surrogate:
                // only the high surrogate is dropped, and the unit after it
                // is processed by the next pass of the loop
            }

            // Otherwise there is not enough data for a low surrogate:
            // the high surrogate is dropped and the loop condition ends processing
        }

        // Otherwise this is a low surrogate without a preceding high surrogate
        // Don't add it to the output
    }

    // Return the result
    return $output;
}
---|
354 | |
---|
355 | /****************************************************************************** |
---|
356 | * End of Function: UTF16_fix |
---|
357 | ******************************************************************************/ |
---|
358 | |
---|
359 | |
---|
360 | |
---|
361 | |
---|
362 | |
---|
363 | /****************************************************************************** |
---|
364 | * |
---|
365 | * Function: UTF8_to_unicode_array |
---|
366 | * |
---|
367 | * Description: Converts a string encoded with Unicode UTF-8, to an array of |
---|
368 | * numbers which represent unicode character numbers |
---|
369 | * |
---|
370 | * Parameters: utf8_text - a string containing the UTF-8 data |
---|
371 | * |
---|
372 | * Returns: output - the array containing the unicode character numbers |
---|
373 | * |
---|
374 | ******************************************************************************/ |
---|
375 | |
---|
/**
 * Converts a string encoded with Unicode UTF-8 to an array of numbers which
 * represent unicode character numbers.
 *
 * The lead byte of each sequence selects its length (1 to 6 bytes) and
 * supplies the highest data bits; each following byte supplies 6 more bits.
 * Bytes which are not valid lead bytes are skipped. A sequence which runs past
 * the end of the string produces no output. The bytes following a lead byte
 * are NOT checked for the 10xxxxxx continuation marker, so the input should
 * already be well formed.
 *
 * @param string $utf8_text a string containing the UTF-8 data
 *
 * @return array the unicode character numbers, in string order
 */
function UTF8_to_unicode_array( $utf8_text )
{
    // Create an array to receive the unicode character numbers output
    $output = array( );

    // The length does not change while scanning, so evaluate it only once
    $length = strlen( $utf8_text );

    // Cycle through the characters in the UTF-8 string
    for ( $pos = 0; $pos < $length; $pos++ )
    {
        // Retrieve the current numerical character value
        // (square brackets are used because the {} offset syntax was removed in PHP 8.0)
        $chval = ord( $utf8_text[$pos] );

        // The lead byte tells us how many bytes the Unicode value covers,
        // and masking off its length marker leaves the highest data bits
        if ( $chval <= 0x7F )
        {
            // 1 Byte UTF-8 Unicode (7-Bit ASCII) - the output equals the input
            $bytes = 1;
            $outputval = $chval;
        }
        else if ( ( $chval >= 0xC0 ) && ( $chval <= 0xDF ) )
        {
            // 2 Byte UTF-8 Unicode - remove the leading 110b
            $bytes = 2;
            $outputval = $chval & 0x1F;
        }
        else if ( ( $chval >= 0xE0 ) && ( $chval <= 0xEF ) )
        {
            // 3 Byte UTF-8 Unicode - remove the leading 1110b
            $bytes = 3;
            $outputval = $chval & 0x0F;
        }
        else if ( ( $chval >= 0xF0 ) && ( $chval <= 0xF7 ) )
        {
            // 4 Byte UTF-8 Unicode - remove the leading 11110b
            $bytes = 4;
            $outputval = $chval & 0x07;
        }
        else if ( ( $chval >= 0xF8 ) && ( $chval <= 0xFB ) )
        {
            // 5 Byte UTF-8 Unicode - remove the leading 111110b
            $bytes = 5;
            $outputval = $chval & 0x03;
        }
        else if ( ( $chval >= 0xFC ) && ( $chval <= 0xFD ) )
        {
            // 6 Byte UTF-8 Unicode - remove the leading 1111110b
            $bytes = 6;
            $outputval = $chval & 0x01;
        }
        else
        {
            // Invalid Code - skip this byte
            continue;
        }

        // Check if there is enough data left in the UTF-8 string to allow the
        // retrieval of the remainder of this unicode character
        if ( ( $pos + $bytes ) > $length )
        {
            // Not enough data - output nothing for this lead byte.
            // The bytes after it are examined individually by the loop
            continue;
        }

        // Cycle through the number of bytes required,
        // minus the first one which has already been done
        while ( $bytes > 1 )
        {
            $pos++;
            $bytes--;

            // Each remaining byte carries 6 bits of data below a 10b marker:
            // make room for them, then add the byte with its marker masked off
            $outputval = ( $outputval << 6 ) | ( ord( $utf8_text[$pos] ) & 0x3F );
        }

        // Add the calculated Unicode number to the output array
        $output[] = $outputval;
    }

    // Return the resulting array
    return $output;
}
---|
467 | |
---|
468 | /****************************************************************************** |
---|
469 | * End of Function: UTF8_to_unicode_array |
---|
470 | ******************************************************************************/ |
---|
471 | |
---|
472 | |
---|
473 | |
---|
474 | |
---|
475 | |
---|
476 | /****************************************************************************** |
---|
477 | * |
---|
478 | * Function: UTF16_to_unicode_array |
---|
479 | * |
---|
480 | * Description: Converts a string encoded with Unicode UTF-16, to an array of |
---|
481 | * numbers which represent unicode character numbers |
---|
482 | * |
---|
483 | * Parameters: utf16_text - a string containing the UTF-16 data |
---|
484 | * MSB_first - True will cause processing as Big Endian UTF-16 (Motorola, MSB first) |
---|
485 | * False will cause processing as Little Endian UTF-16 (Intel, LSB first) |
---|
486 | * |
---|
487 | * Returns: output - the array containing the unicode character numbers |
---|
488 | * |
---|
489 | ******************************************************************************/ |
---|
490 | |
---|
/**
 * Converts a string encoded with Unicode UTF-16 to an array of numbers which
 * represent unicode character numbers.
 *
 * The string is read two bytes (one 16 bit code unit) at a time:
 *  - a unit outside 0xD800-0xDFFF is output as it is;
 *  - a high surrogate followed by a low surrogate is combined into a single
 *    character number: 0x10000 + (hi - 0xD800) * 0x400 + (lo - 0xDC00);
 *  - a high surrogate which is not followed by a low surrogate is dropped,
 *    and the unit after it is examined again in its own right;
 *  - a low surrogate which does not follow a high surrogate is dropped;
 *  - a single trailing byte which cannot form a code unit is dropped.
 *
 * @param string $utf16_text a string containing the UTF-16 data
 * @param bool   $MSB_first  True will cause processing as Big Endian UTF-16 (Motorola, MSB first)
 *                           False will cause processing as Little Endian UTF-16 (Intel, LSB first)
 *
 * @return array the unicode character numbers, in string order
 */
function UTF16_to_unicode_array( $utf16_text, $MSB_first )
{
    // Create an array to receive the unicode character numbers output
    $output = array( );

    // The length does not change while scanning, so evaluate it only once
    $length = strlen( $utf16_text );

    // Initialise the current position in the string
    $pos = 0;

    // Cycle through each complete 16 bit code unit.
    // A trailing odd byte never satisfies this condition and is therefore dropped
    while ( ( $pos + 1 ) < $length )
    {
        // Retrieve the two bytes of the current code unit
        // (square brackets are used because the {} offset syntax was removed in PHP 8.0)
        $chval1 = ord( $utf16_text[$pos] );
        $chval2 = ord( $utf16_text[$pos + 1] );

        // Skip over the code unit just read
        $pos += 2;

        // Calculate the 16 bit unicode value
        if ( $MSB_first )
        {
            // Big Endian
            $UTF16_val = ( $chval1 << 8 ) | $chval2;
        }
        else
        {
            // Little Endian
            $UTF16_val = ( $chval2 << 8 ) | $chval1;
        }

        if ( ( $UTF16_val <= 0xD7FF ) || ( $UTF16_val >= 0xE000 ) )
        {
            // Normal Character (Non Surrogate pair)
            // Add it to the output
            $output[] = $UTF16_val;
        }
        else if ( ( $UTF16_val >= 0xD800 ) && ( $UTF16_val <= 0xDBFF ) )
        {
            // High surrogate of a surrogate pair
            // Now we need to read the low surrogate, which needs exactly two
            // more bytes. (The previous test of "$pos + 3" demanded four more
            // bytes, so a valid pair at the very end of the string was lost.)
            if ( ( $pos + 1 ) < $length )
            {
                $chval3 = ord( $utf16_text[$pos] );
                $chval4 = ord( $utf16_text[$pos + 1] );

                // Calculate the second 16 bit unicode value
                if ( $MSB_first )
                {
                    // Big Endian
                    $UTF16_val2 = ( $chval3 << 8 ) | $chval4;
                }
                else
                {
                    // Little Endian
                    $UTF16_val2 = ( $chval4 << 8 ) | $chval3;
                }

                // Check that this is a low surrogate
                if ( ( $UTF16_val2 >= 0xDC00 ) && ( $UTF16_val2 <= 0xDFFF ) )
                {
                    // Low surrogate found following high surrogate
                    // Combine the two into a single character number
                    $output[] = 0x10000 + ( ( $UTF16_val - 0xD800 ) * 0x400 ) + ( $UTF16_val2 - 0xDC00 );

                    // Skip over the low surrogate
                    $pos += 2;
                }

                // Otherwise no low surrogate followed the high surrogate:
                // only the high surrogate is dropped, and the unit after it
                // is processed by the next pass of the loop
            }

            // Otherwise there is not enough data for a low surrogate:
            // the high surrogate is dropped and the loop condition ends processing
        }

        // Otherwise this is a low surrogate without a preceding high surrogate
        // Don't add it to the output
    }

    // Return the result
    return $output;
}
---|
606 | |
---|
607 | /****************************************************************************** |
---|
608 | * End of Function: UTF16_to_unicode_array |
---|
609 | ******************************************************************************/ |
---|
610 | |
---|
611 | |
---|
612 | |
---|
613 | |
---|
614 | |
---|
615 | |
---|
616 | |
---|
617 | /****************************************************************************** |
---|
618 | * |
---|
619 | * Function: unicode_array_to_UTF8 |
---|
620 | * |
---|
621 | * Description: Converts an array of unicode character numbers to a string |
---|
622 | * encoded by UTF-8 |
---|
623 | * |
---|
624 | * Parameters: unicode_array - the array containing unicode character numbers |
---|
625 | * |
---|
626 | * Returns: output - the UTF-8 encoded string representing the data |
---|
627 | * |
---|
628 | ******************************************************************************/ |
---|
629 | |
---|
/**
 * Converts an array of unicode character numbers to a string encoded by UTF-8.
 *
 * Each number is encoded with the shortest form from the table at the top of
 * this file (1 to 6 bytes, covering 0x00 to 0x7FFFFFFF). Numbers outside that
 * range produce no output.
 *
 * The bit fields are extracted with integer shifts. Division was used
 * previously, which yields a float whenever the number is not an exact
 * multiple of the divisor; handing such a float to chr() or to the & operator
 * is an implicit lossy conversion, which PHP 8.1 and later report as deprecated.
 *
 * @param array $unicode_array the array containing unicode character numbers
 *
 * @return string the UTF-8 encoded string representing the data
 */
function unicode_array_to_UTF8( $unicode_array )
{
    // Create a string to receive the UTF-8 output
    $output = "";

    // Cycle through each Unicode character number
    foreach ( $unicode_array as $unicode_char )
    {
        // Check which range the current unicode character lies in
        if ( ( $unicode_char >= 0x00 ) && ( $unicode_char <= 0x7F ) )
        {
            // 1 Byte UTF-8 Unicode (7-Bit ASCII) - output is equal to input
            $output .= chr( $unicode_char );
        }
        else if ( ( $unicode_char >= 0x80 ) && ( $unicode_char <= 0x7FF ) )
        {
            // 2 Byte UTF-8 Unicode - binary encode data as : 110xxxxx 10xxxxxx
            $output .= chr( 0xC0 | ( $unicode_char >> 6 ) );
            $output .= chr( 0x80 | ( $unicode_char & 0x3F ) );
        }
        else if ( ( $unicode_char >= 0x800 ) && ( $unicode_char <= 0xFFFF ) )
        {
            // 3 Byte UTF-8 Unicode - binary encode data as : 1110xxxx 10xxxxxx 10xxxxxx
            $output .= chr( 0xE0 | ( $unicode_char >> 12 ) );
            $output .= chr( 0x80 | ( ( $unicode_char >> 6 ) & 0x3F ) );
            $output .= chr( 0x80 | ( $unicode_char & 0x3F ) );
        }
        else if ( ( $unicode_char >= 0x10000 ) && ( $unicode_char <= 0x1FFFFF ) )
        {
            // 4 Byte UTF-8 Unicode - binary encode data as : 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            $output .= chr( 0xF0 | ( $unicode_char >> 18 ) );
            $output .= chr( 0x80 | ( ( $unicode_char >> 12 ) & 0x3F ) );
            $output .= chr( 0x80 | ( ( $unicode_char >> 6 ) & 0x3F ) );
            $output .= chr( 0x80 | ( $unicode_char & 0x3F ) );
        }
        else if ( ( $unicode_char >= 0x200000 ) && ( $unicode_char <= 0x3FFFFFF ) )
        {
            // 5 Byte UTF-8 Unicode - binary encode data as : 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
            $output .= chr( 0xF8 | ( $unicode_char >> 24 ) );
            $output .= chr( 0x80 | ( ( $unicode_char >> 18 ) & 0x3F ) );
            $output .= chr( 0x80 | ( ( $unicode_char >> 12 ) & 0x3F ) );
            $output .= chr( 0x80 | ( ( $unicode_char >> 6 ) & 0x3F ) );
            $output .= chr( 0x80 | ( $unicode_char & 0x3F ) );
        }
        else if ( ( $unicode_char >= 0x4000000 ) && ( $unicode_char <= 0x7FFFFFFF ) )
        {
            // 6 Byte UTF-8 Unicode - binary encode data as : 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
            $output .= chr( 0xFC | ( $unicode_char >> 30 ) );
            $output .= chr( 0x80 | ( ( $unicode_char >> 24 ) & 0x3F ) );
            $output .= chr( 0x80 | ( ( $unicode_char >> 18 ) & 0x3F ) );
            $output .= chr( 0x80 | ( ( $unicode_char >> 12 ) & 0x3F ) );
            $output .= chr( 0x80 | ( ( $unicode_char >> 6 ) & 0x3F ) );
            $output .= chr( 0x80 | ( $unicode_char & 0x3F ) );
        }

        // Any other value is an invalid code - nothing is output for it
    }

    // Return resulting UTF-8 String
    return $output;
}
---|
701 | |
---|
702 | /****************************************************************************** |
---|
703 | * End of Function: unicode_array_to_UTF8 |
---|
704 | ******************************************************************************/ |
---|
705 | |
---|
706 | |
---|
707 | |
---|
708 | |
---|
709 | |
---|
710 | |
---|
711 | |
---|
712 | |
---|
713 | |
---|
714 | /****************************************************************************** |
---|
715 | * |
---|
716 | * Function: unicode_array_to_UTF16 |
---|
717 | * |
---|
718 | * Description: Converts an array of unicode character numbers to a string |
---|
719 | * encoded by UTF-16 |
---|
720 | * |
---|
721 | * Parameters: unicode_array - the array containing unicode character numbers |
---|
722 | * MSB_first - True will cause processing as Big Endian UTF-16 (Motorola, MSB first) |
---|
723 | * False will cause processing as Little Endian UTF-16 (Intel, LSB first) |
---|
724 | * |
---|
725 | * Returns: output - the UTF-16 encoded string representing the data |
---|
726 | * |
---|
727 | ******************************************************************************/ |
---|
728 | |
---|
/**
 * Converts an array of unicode character numbers to a string encoded by UTF-16.
 *
 * Numbers in 0x0000-0xD7FF and 0xE000-0xFFFF become a single 16 bit code unit.
 * Numbers in 0x10000-0x10FFFF become a high/low surrogate pair. Any other
 * number (including 0xD800-0xDFFF, which is reserved for the surrogates
 * themselves) cannot be encoded in UTF-16 and produces no output.
 *
 * The bytes are extracted with integer shifts and masks. Division was used
 * previously, which yields a float whenever the number is not an exact
 * multiple of the divisor; handing such a float to chr() or to the % operator
 * is an implicit lossy conversion, which PHP 8.1 and later report as deprecated.
 *
 * @param array $unicode_array the array containing unicode character numbers
 * @param bool  $MSB_first     True will cause processing as Big Endian UTF-16 (Motorola, MSB first)
 *                             False will cause processing as Little Endian UTF-16 (Intel, LSB first)
 *
 * @return string the UTF-16 encoded string representing the data
 */
function unicode_array_to_UTF16( $unicode_array, $MSB_first )
{
    // Create a string to receive the UTF-16 output
    $output = "";

    // Cycle through each Unicode character number
    foreach ( $unicode_array as $unicode_char )
    {
        // Work out which 16 bit code units represent the current character
        if ( ( ( $unicode_char >= 0x0000 ) && ( $unicode_char <= 0xD7FF ) ) ||
             ( ( $unicode_char >= 0xE000 ) && ( $unicode_char <= 0xFFFF ) ) )
        {
            // Normal 16 Bit Character (Not a Surrogate Pair)
            $code_units = array( $unicode_char );
        }
        else if ( ( $unicode_char >= 0x10000 ) && ( $unicode_char <= 0x10FFFF ) )
        {
            // Surrogate Pair required - 20 bits of data remain after removing
            // the 0x10000 offset: the high surrogate carries the upper 10 bits
            // and the low surrogate carries the lower 10 bits
            $offset_char = $unicode_char - 0x10000;
            $code_units = array( 0xD800 | ( $offset_char >> 10 ),
                                 0xDC00 | ( $offset_char & 0x3FF ) );
        }
        else
        {
            // Invalid UTF-16 codepoint
            // Unicode value should never be between 0xD800 and 0xDFFF
            // Do not output this point - there is no way to encode it in UTF-16
            continue;
        }

        // Write out each code unit using the requested byte order
        foreach ( $code_units as $code_unit )
        {
            $high_byte = chr( ( $code_unit >> 8 ) & 0xFF );
            $low_byte = chr( $code_unit & 0xFF );

            if ( $MSB_first )
            {
                // Big Endian
                $output .= $high_byte . $low_byte;
            }
            else
            {
                // Little Endian
                $output .= $low_byte . $high_byte;
            }
        }
    }

    // Return resulting UTF-16 String
    return $output;
}
---|
791 | |
---|
792 | /****************************************************************************** |
---|
793 | * End of Function: unicode_array_to_UTF16 |
---|
794 | ******************************************************************************/ |
---|
795 | |
---|
796 | |
---|
797 | |
---|
798 | |
---|
799 | |
---|
/******************************************************************************
*
* Function:     xml_UTF8_clean
*
* Description:  XML has specific requirements about the characters that are
*               allowed, and characters that must be escaped.
*               This function deletes every character that falls outside the
*               ranges permitted by XML, escapes Quotes, Greater than,
*               Less than and Ampersand, and replaces Tab, Line Feed and
*               Carriage Return with numeric character references so that
*               they are not treated as expendable white space.
*               Note - Do not use this on constructed XML which includes tags,
*                      as it will escape the tags. It is designed to be used
*                      on the tag and attribute names, attribute values, and text.
*
* Parameters:   UTF8_text - a string containing the UTF-8 data
*
* Returns:      UTF8_text - a string containing the cleaned and escaped
*                           UTF-8 text
*
******************************************************************************/

function xml_UTF8_clean( $UTF8_text )
{
        // Ensure that the Unicode UTF8 encoding is valid.
        $UTF8_text = UTF8_fix( $UTF8_text );

        // XML only allows characters in the following unicode ranges
        // #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
        // Hence we need to delete any characters that dont fit this

        // Convert the UTF-8 string to an array of unicode character numbers
        $unicode_array = UTF8_to_unicode_array( $UTF8_text );

        // Create a new array to receive the valid unicode character numbers
        $new_unicode_array = array( );

        // Cycle through the unicode character numbers
        foreach( $unicode_array as $unichar )
        {
                // Check if the unicode character number is valid for XML
                if ( ( $unichar == 0x09 ) ||
                     ( $unichar == 0x0A ) ||
                     ( $unichar == 0x0D ) ||
                     ( ( $unichar >= 0x20 ) && ( $unichar <= 0xD7FF ) ) ||
                     ( ( $unichar >= 0xE000 ) && ( $unichar <= 0xFFFD ) ) ||
                     ( ( $unichar >= 0x10000 ) && ( $unichar <= 0x10FFFF ) ) )
                {
                        // Unicode character is valid for XML - add it to the valid characters array
                        $new_unicode_array[] = $unichar;
                }
        }

        // Convert the array of valid unicode character numbers back to UTF-8 encoded text
        $UTF8_text = unicode_array_to_UTF8( $new_unicode_array );

        // Escape any special HTML characters present.
        // The charset is given explicitly so that the result does not depend
        // on the default_charset configuration of the running PHP.
        $UTF8_text = htmlspecialchars( $UTF8_text, ENT_QUOTES, 'UTF-8' );

        // Escape CR, LF and TAB characters, so that they are kept and not treated as expendable white space.
        // The replacements must be numeric character references - mapping each
        // control character to itself would leave the text unchanged.
        $trans = array( "\x09" => "&#x09;", "\x0A" => "&#x0A;", "\x0D" => "&#x0D;" );
        $UTF8_text = strtr( $UTF8_text, $trans );

        // Return the resulting XML valid string
        return $UTF8_text;
}
---|
867 | |
---|
868 | /****************************************************************************** |
---|
869 | * End of Function: xml_UTF8_clean |
---|
870 | ******************************************************************************/ |
---|
871 | |
---|
872 | |
---|
873 | |
---|
874 | |
---|
875 | |
---|
876 | |
---|
877 | |
---|
878 | |
---|
879 | |
---|
/******************************************************************************
*
* Function:     xml_UTF16_clean
*
* Description:  XML has specific requirements about the characters that are
*               allowed, and characters that must be escaped.
*               This function deletes every character that falls outside the
*               ranges permitted by XML, escapes Quotes, Greater than,
*               Less than and Ampersand, and replaces Tab, Line Feed and
*               Carriage Return with numeric character references so that
*               they are not treated as expendable white space.
*               Note - Do not use this on constructed XML which includes tags,
*                      as it will escape the tags. It is designed to be used
*                      on the tag and attribute names, attribute values, and text.
*
* Parameters:   UTF16_text - a string containing the UTF-16 data
*               MSB_first - True will cause processing as Big Endian UTF-16 (Motorola, MSB first)
*                           False will cause processing as Little Endian UTF-16 (Intel, LSB first)
*
* Returns:      UTF16_text - a string containing the cleaned and escaped
*                            UTF-16 text, encoded with the byte order
*                            selected by MSB_first
*
******************************************************************************/

function xml_UTF16_clean( $UTF16_text, $MSB_first )
{
        // Ensure that the Unicode UTF16 encoding is valid.
        $UTF16_text = UTF16_fix( $UTF16_text, $MSB_first );

        // XML only allows characters in the following unicode ranges
        // #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
        // Hence we need to delete any characters that dont fit this

        // Convert the UTF-16 string to an array of unicode character numbers
        $unicode_array = UTF16_to_unicode_array( $UTF16_text, $MSB_first );

        // Create a new array to receive the valid unicode character numbers
        $new_unicode_array = array( );

        // Cycle through the unicode character numbers
        foreach( $unicode_array as $unichar )
        {
                // Check if the unicode character number is valid for XML
                if ( ( $unichar == 0x09 ) ||
                     ( $unichar == 0x0A ) ||
                     ( $unichar == 0x0D ) ||
                     ( ( $unichar >= 0x20 ) && ( $unichar <= 0xD7FF ) ) ||
                     ( ( $unichar >= 0xE000 ) && ( $unichar <= 0xFFFD ) ) ||
                     ( ( $unichar >= 0x10000 ) && ( $unichar <= 0x10FFFF ) ) )
                {
                        // Unicode character is valid for XML - add it to the valid characters array
                        $new_unicode_array[] = $unichar;
                }
        }

        // htmlspecialchars and strtr work on single bytes. Running them on
        // UTF-16 data would insert one-byte ASCII sequences into a stream of
        // two-byte code units (and would also hit bytes that merely happen to
        // equal '<', '&', etc. inside other characters), corrupting the text.
        // The escaping is therefore done on a UTF-8 copy, where every reserved
        // character is a single byte, and the result is encoded to UTF-16 last.
        $UTF8_text = unicode_array_to_UTF8( $new_unicode_array );

        // Escape any special HTML characters present
        $UTF8_text = htmlspecialchars( $UTF8_text, ENT_QUOTES, 'UTF-8' );

        // Escape CR, LF and TAB characters, so that they are kept and not treated as expendable white space.
        // The replacements must be numeric character references - mapping each
        // control character to itself would leave the text unchanged.
        $trans = array( "\x09" => "&#x09;", "\x0A" => "&#x0A;", "\x0D" => "&#x0D;" );
        $UTF8_text = strtr( $UTF8_text, $trans );

        // Convert the escaped text to UTF-16 with the requested byte order
        // and return the resulting XML valid string
        return unicode_array_to_UTF16( UTF8_to_unicode_array( $UTF8_text ), $MSB_first );
}
---|
949 | |
---|
950 | /****************************************************************************** |
---|
951 | * End of Function: xml_UTF16_clean |
---|
952 | ******************************************************************************/ |
---|
953 | |
---|
954 | |
---|
955 | |
---|
956 | |
---|
957 | |
---|
958 | |
---|
/******************************************************************************
*
* Function:     HTML_UTF8_Escape
*
* Description:  A HTML page can display UTF-8 data properly if it has a
*               META http-equiv="Content-Type" tag with the content attribute
*               including the value: "charset=utf-8".
*               Otherwise the ISO-8859-1 character set is usually assumed, and
*               Unicode values above 0x7F must be escaped.
*               This function takes a UTF-8 encoded string, escapes the
*               reserved HTML characters such as Quotes, Greater than,
*               Less than and Ampersand, and writes every character above
*               0x7F as a hexadecimal numeric character reference.
*
* Parameters:   UTF8_text - a string containing the UTF-8 data
*
* Returns:      htmloutput - a string containing the HTML equivalent
*
******************************************************************************/

function HTML_UTF8_Escape( $UTF8_text )
{
        // Repair the UTF-8 encoding, then escape the reserved HTML characters.
        // smart_htmlspecialchars is used (as of revision 1.10) so that
        // characters which were already escaped remain intact.
        // Note: smart_htmlspecialchars declares a single parameter, so the
        // ENT_QUOTES argument is accepted but has no effect.
        $escaped_text = smart_htmlspecialchars( UTF8_fix( $UTF8_text ), ENT_QUOTES );

        // Collect the output one character at a time
        $pieces = array( );

        // Walk over the unicode character numbers of the escaped text
        foreach( UTF8_to_unicode_array( $escaped_text ) as $code_point )
        {
                if ( ! ( ( $code_point >= 0x00 ) && ( $code_point <= 0x7F ) ) )
                {
                        // Outside the 7 bit range - emit a hex character reference
                        $pieces[] = "&#x" . dechex( $code_point ) . ";";
                        continue;
                }

                // 7 bit character - copy it through unchanged
                $pieces[] = chr( $code_point );
        }

        // Join the pieces into the resulting escaped HTML
        return implode( "", $pieces );
}
---|
1013 | |
---|
1014 | /****************************************************************************** |
---|
1015 | * End of Function: HTML_UTF8_Escape |
---|
1016 | ******************************************************************************/ |
---|
1017 | |
---|
1018 | |
---|
1019 | |
---|
/******************************************************************************
*
* Function:     HTML_UTF8_UnEscape
*
* Description:  Converts HTML which contains escaped decimal (&#NNN;) or
*               hexadecimal (&#xHHH; / &#XHHH;) characters into UTF-8 text.
*               Every reference is decoded exactly once: text produced by
*               decoding one reference is never decoded again, so
*               "&#38;#65;" becomes "&#65;" and not "A".
*               Named entities (such as &amp;) are left untouched.
*
* Parameters:   HTML_text - a string containing the HTML text to convert
*
* Returns:      utfoutput - a string containing the UTF-8 equivalent
*
******************************************************************************/

function HTML_UTF8_UnEscape( $HTML_text )
{
        // Find the decimal and the hexadecimal character references.
        // The character classes list only the intended characters - a '|'
        // inside a class is a literal, not an alternation.
        preg_match_all( '/&#(\d+);/', $HTML_text, $decmatches );
        preg_match_all( '/&#[xX]([0-9A-Fa-f]+);/', $HTML_text, $hexmatches );

        // Build one translation table: reference text => UTF-8 character
        $trans = array( );

        foreach( $decmatches[1] as $index => $digits )
        {
                $trans[ $decmatches[0][$index] ] = unicode_array_to_UTF8( array( (int) $digits ) );
        }

        foreach( $hexmatches[1] as $index => $digits )
        {
                $trans[ $hexmatches[0][$index] ] = unicode_array_to_UTF8( array( hexdec( $digits ) ) );
        }

        // Nothing to decode - return the text unchanged
        if ( count( $trans ) == 0 )
        {
                return $HTML_text;
        }

        // Apply all replacements in a single pass. strtr with an array does
        // not re-scan text it has already replaced, which prevents a decoded
        // '&' from combining with the following text into a new reference.
        return strtr( $HTML_text, $trans );
}
---|
1051 | |
---|
1052 | /****************************************************************************** |
---|
1053 | * End of Function: HTML_UTF8_UnEscape |
---|
1054 | ******************************************************************************/ |
---|
1055 | |
---|
1056 | |
---|
1057 | |
---|
1058 | |
---|
1059 | |
---|
1060 | |
---|
/******************************************************************************
*
* Function:     HTML_UTF16_Escape
*
* Description:  A HTML page can display UTF-16 data properly if it has a
*               META http-equiv="Content-Type" tag with the content attribute
*               including the value: "charset=utf-16".
*               Otherwise the ISO-8859-1 character set is usually assumed, and
*               Unicode values above 0x7F must be escaped.
*               This function takes a UTF-16 encoded string and escapes the
*               characters above 0x7F as well as reserved HTML characters such
*               as Quotes, Greater than, Less than and Ampersand.
*               The result contains only 7 bit characters.
*
* Parameters:   UTF16_text - a string containing the UTF-16 data
*               MSB_first - True will cause processing as Big Endian UTF-16 (Motorola, MSB first)
*                           False will cause processing as Little Endian UTF-16 (Intel, LSB first)
*
* Returns:      htmloutput - a string containing the HTML equivalent
*
******************************************************************************/

function HTML_UTF16_Escape( $UTF16_text, $MSB_first )
{
        // Ensure that the Unicode UTF16 encoding is valid.
        $UTF16_text = UTF16_fix( $UTF16_text, $MSB_first );

        // Convert the UTF-16 string to an array of unicode character numbers
        $unicode_array = UTF16_to_unicode_array( $UTF16_text, $MSB_first );

        // Escape any special HTML characters present.
        // smart_htmlspecialchars replaces single bytes, so it must not be run
        // on the raw UTF-16 data: that would insert one-byte ASCII sequences
        // into a stream of two-byte code units and corrupt it. The escaping is
        // done on a UTF-8 copy, where each reserved character is one byte.
        // smart_htmlspecialchars is used (as of revision 1.10) so that
        // characters which were already escaped remain intact.
        $UTF8_text = smart_htmlspecialchars( unicode_array_to_UTF8( $unicode_array ) );

        // Convert the escaped text back to unicode character numbers
        $unicode_array = UTF8_to_unicode_array( $UTF8_text );

        // Create a string to receive the escaped HTML
        $htmloutput = "";

        // Cycle through the unicode character numbers
        foreach( $unicode_array as $unichar )
        {
                // Check if the character needs to be escaped
                if ( ( $unichar >= 0x00 ) && ( $unichar <= 0x7F ) )
                {
                        // Character is in the 7 bit range - add it to the html as is
                        $htmloutput .= chr( $unichar );
                }
                else
                {
                        // Character is greater than 0x7F - escape it and add it to the html
                        $htmloutput .= "&#x" . dechex( $unichar ) . ";";
                }
        }

        // Return the resulting escaped HTML
        return $htmloutput;
}
---|
1117 | |
---|
1118 | /****************************************************************************** |
---|
1119 | * End of Function: HTML_UTF16_Escape |
---|
1120 | ******************************************************************************/ |
---|
1121 | |
---|
1122 | |
---|
/******************************************************************************
*
* Function:     HTML_UTF16_UnEscape
*
* Description:  Converts HTML which contains escaped decimal or hex characters
*               into UTF-16 text. The references are first decoded to UTF-8
*               by HTML_UTF8_UnEscape, and the result is then re-encoded
*               as UTF-16.
*
* Parameters:   HTML_text - a string containing the HTML text to be converted
*               MSB_first - True will cause processing as Big Endian UTF-16 (Motorola, MSB first)
*                           False will cause processing as Little Endian UTF-16 (Intel, LSB first)
*
* Returns:      utfoutput - a string containing the UTF-16 equivalent
*
******************************************************************************/

function HTML_UTF16_UnEscape( $HTML_text, $MSB_first )
{
        // Decode the character references, giving UTF-8 text
        $decoded_utf8 = HTML_UTF8_UnEscape( $HTML_text );

        // Break the UTF-8 text into unicode character numbers
        $code_points = UTF8_to_unicode_array( $decoded_utf8 );

        // Encode the character numbers as UTF-16 in the requested byte order
        return unicode_array_to_UTF16( $code_points, $MSB_first );
}
---|
1144 | |
---|
1145 | /****************************************************************************** |
---|
1146 | * End of Function: HTML_UTF16_UnEscape |
---|
1147 | ******************************************************************************/ |
---|
1148 | |
---|
1149 | |
---|
1150 | |
---|
1151 | |
---|
/******************************************************************************
*
* Function:     smart_HTML_Entities
*
* Description:  Performs the same function as HTML_Entities, but leaves entities
*               that are already escaped intact.
*
* Parameters:   HTML_text - a string containing the HTML text to be escaped
*
* Returns:      HTML_text_out - a string containing the escaped HTML text
*
******************************************************************************/

function smart_HTML_Entities( $HTML_text )
{
        // Get a table containing the HTML entities translations
        $translation_table = get_html_translation_table( HTML_ENTITIES );

        // Change the ampersand to translate to itself, so that the ampersand
        // of an existing entity is not turned into &amp;amp; by the table.
        // Ampersands are handled separately by the regular expression below.
        $translation_table[ chr(38) ] = '&';

        // Perform replacements
        // Regular expression says: find an ampersand, check the text after it,
        // if the text after it is not one of the following, then replace the ampersand
        // with &amp;
        // a) any combination of up to 4 letters (upper or lower case) followed by 2 or 3 word characters, then a semicolon
        // b) a hash symbol, then between 1 and 7 decimal digits
        // c) a hash symbol, an 'x' or 'X' character, then between 1 and 7 hexadecimal digits (0-9, A-F, a-f)
        return preg_replace( "/&(?![A-Za-z]{0,4}\w{2,3};|#[0-9]{1,7}|#[xX][0-9A-Fa-f]{1,7})/", "&amp;", strtr( $HTML_text, $translation_table ) );
}
---|
1183 | |
---|
1184 | /****************************************************************************** |
---|
1185 | * End of Function: smart_HTML_Entities |
---|
1186 | ******************************************************************************/ |
---|
1187 | |
---|
1188 | |
---|
1189 | |
---|
/******************************************************************************
*
* Function:     smart_htmlspecialchars
*
* Description:  Performs the same function as htmlspecialchars, but leaves characters
*               that are already escaped intact.
*
* Parameters:   HTML_text - a string containing the HTML text to be escaped
*
* Returns:      HTML_text_out - a string containing the escaped HTML text
*
******************************************************************************/

function smart_htmlspecialchars( $HTML_text )
{
        // Get a table containing the HTML special characters translations
        $translation_table = get_html_translation_table( HTML_SPECIALCHARS );

        // Change the ampersand to translate to itself, so that the ampersand
        // of an existing entity is not turned into &amp;amp; by the table.
        // Ampersands are handled separately by the regular expression below.
        $translation_table[ chr(38) ] = '&';

        // Perform replacements
        // Regular expression says: find an ampersand, check the text after it,
        // if the text after it is not one of the following, then replace the ampersand
        // with &amp;
        // a) any combination of up to 4 letters (upper or lower case) followed by 2 or 3 word characters, then a semicolon
        // b) a hash symbol, then between 1 and 7 decimal digits
        // c) a hash symbol, an 'x' or 'X' character, then between 1 and 7 hexadecimal digits (0-9, A-F, a-f)
        return preg_replace( "/&(?![A-Za-z]{0,4}\w{2,3};|#[0-9]{1,7}|#[xX][0-9A-Fa-f]{1,7})/", "&amp;", strtr( $HTML_text, $translation_table ) );
}
---|
1221 | |
---|
1222 | /****************************************************************************** |
---|
1223 | * End of Function: smart_htmlspecialchars |
---|
1224 | ******************************************************************************/ |
---|
1225 | |
---|
1226 | |
---|
1227 | ?> |
---|