Simpletest Coverage - includes/unicode.inc

1 <?php
2 // $Id: unicode.inc,v 1.39 2009/05/24 17:39:30 dries Exp $
3
/**
 * Indicates an error during check for PHP unicode support.
 *
 * _unicode_check() reports this status when the mbstring extension is loaded
 * but one of its php.ini settings is incompatible.
 */
define('UNICODE_ERROR', -1);

/**
 * Indicates that standard PHP (emulated) unicode support is being used.
 *
 * _unicode_check() reports this status when mb_strlen() is not available.
 */
define('UNICODE_SINGLEBYTE', 0);

/**
 * Indicates that full unicode support with the PHP mbstring extension is being
 * used.
 *
 * _unicode_check() reports this status when mbstring is available and all of
 * its configuration checks pass.
 */
define('UNICODE_MULTIBYTE', 1);
19
/**
 * Run the Unicode support check and remember the detected support level.
 *
 * Only the status code returned by _unicode_check() is kept; it is stored in
 * the global $multibyte variable. The accompanying message is discarded.
 */
function unicode_check() {
  $result = _unicode_check();
  $GLOBALS['multibyte'] = $result[0];
}
26
/**
 * Check the Unicode support offered by PHP and configure it where possible.
 *
 * Because Drupal needs to be able to handle text in various encodings,
 * mbstring function overloading is not supported. HTTP input/output conversion
 * must be disabled for similar reasons.
 *
 * As side effects this sets the LC_CTYPE locale to 'C' and, when mbstring is
 * usable, switches its internal encoding to UTF-8.
 *
 * @return
 *   An array with two elements: one of the UNICODE_* status constants, and a
 *   translated message describing the problem (an empty string when full
 *   multibyte support is available).
 */
function _unicode_check() {
  // get_t() keeps the messages translatable at install time.
  $t = get_t();

  // The standard C locale guarantees consistent, ASCII-only string handling.
  setlocale(LC_CTYPE, 'C');

  // Every message links to the same documentation page.
  $doc_link = array('@url' => 'http://www.php.net/mbstring');

  // No mbstring extension: fall back to the emulated implementation.
  if (!function_exists('mb_strlen')) {
    $message = $t('Operations on Unicode strings are emulated on a best-effort basis. Install the <a href="@url">PHP mbstring extension</a> for improved Unicode support.', $doc_link);
    return array(UNICODE_SINGLEBYTE, $message);
  }

  // mbstring is present; refuse configurations that transparently rewrite
  // strings behind Drupal's back.
  if (ini_get('mbstring.func_overload') != 0) {
    $message = $t('Multibyte string function overloading in PHP is active and must be disabled. Check the php.ini <em>mbstring.func_overload</em> setting. Please refer to the <a href="@url">PHP mbstring documentation</a> for more information.', $doc_link);
    return array(UNICODE_ERROR, $message);
  }
  if (ini_get('mbstring.encoding_translation') != 0) {
    $message = $t('Multibyte string input conversion in PHP is active and must be disabled. Check the php.ini <em>mbstring.encoding_translation</em> setting. Please refer to the <a href="@url">PHP mbstring documentation</a> for more information.', $doc_link);
    return array(UNICODE_ERROR, $message);
  }
  if (ini_get('mbstring.http_input') != 'pass') {
    $message = $t('Multibyte string input conversion in PHP is active and must be disabled. Check the php.ini <em>mbstring.http_input</em> setting. Please refer to the <a href="@url">PHP mbstring documentation</a> for more information.', $doc_link);
    return array(UNICODE_ERROR, $message);
  }
  if (ini_get('mbstring.http_output') != 'pass') {
    $message = $t('Multibyte string output conversion in PHP is active and must be disabled. Check the php.ini <em>mbstring.http_output</em> setting. Please refer to the <a href="@url">PHP mbstring documentation</a> for more information.', $doc_link);
    return array(UNICODE_ERROR, $message);
  }

  // All checks passed: make mbstring work in UTF-8.
  mb_internal_encoding('utf-8');
  mb_language('uni');
  return array(UNICODE_MULTIBYTE, '');
}
69
/**
 * Report the Unicode library in use as a requirements entry.
 *
 * @return
 *   An array with a single 'unicode' element holding the keys 'title',
 *   'value', 'severity' and, when _unicode_check() supplied a message,
 *   'description'.
 */
function unicode_requirements() {
  // get_t() keeps the labels translatable at install time.
  $t = get_t();

  // Human-readable label and requirement severity for each status code.
  $status_info = array(
    UNICODE_SINGLEBYTE => array($t('Standard PHP'), REQUIREMENT_WARNING),
    UNICODE_MULTIBYTE => array($t('PHP Mbstring Extension'), REQUIREMENT_OK),
    UNICODE_ERROR => array($t('Error'), REQUIREMENT_ERROR),
  );

  $check = _unicode_check();
  $status = $check[0];
  $message = $check[1];
  list($label, $severity) = $status_info[$status];

  $requirement = array(
    'title' => $t('Unicode library'),
    'value' => $label,
  );
  if ($message) {
    $requirement['description'] = $message;
  }
  $requirement['severity'] = $severity;

  return array('unicode' => $requirement);
}
101
/**
 * Prepare a new XML parser.
 *
 * This is a wrapper around xml_parser_create() which extracts the encoding from
 * the XML data first and sets the output encoding to UTF-8. This function should
 * be used instead of xml_parser_create(), because PHP 4's XML parser doesn't
 * check the input encoding itself. "Starting from PHP 5, the input encoding is
 * automatically detected, so that the encoding parameter specifies only the
 * output encoding."
 *
 * This is also where unsupported encodings will be converted. Callers should
 * take this into account: $data might have been changed after the call (a
 * UTF-8 byte order mark is stripped, and data in an encoding other than UTF-8,
 * ISO-8859-1 or US-ASCII is converted to UTF-8 with its encoding declaration
 * rewritten).
 *
 * The encoding declaration is recognized with either single or double quotes
 * and with optional whitespace around the equals sign, as the XML
 * specification allows both.
 *
 * @param &$data
 *   The XML data which will be parsed later.
 * @return
 *   An XML parser object or FALSE on error.
 */
function drupal_xml_parser_create(&$data) {
  // Default XML encoding is UTF-8.
  $encoding = 'utf-8';
  $bom = FALSE;

  // Check for UTF-8 byte order mark (PHP5's XML parser doesn't handle it).
  if (!strncmp($data, "\xEF\xBB\xBF", 3)) {
    $bom = TRUE;
    $data = substr($data, 3);
  }

  // Check for an encoding declaration in the XML prolog if no BOM was found.
  // Group 1 captures the quote character so the closing quote must match it.
  if (!$bom && preg_match('/^<\?xml[^>]+encoding\s*=\s*(["\'])(.+?)\1/', $data, $match)) {
    $encoding = $match[2];
  }

  // Unsupported encodings are converted here into UTF-8.
  $php_supported = array('utf-8', 'iso-8859-1', 'us-ascii');
  if (!in_array(strtolower($encoding), $php_supported)) {
    $out = drupal_convert_to_utf8($data, $encoding);
    if ($out !== FALSE) {
      $encoding = 'utf-8';
      // Keep the prolog truthful: the data is UTF-8 from here on.
      $data = preg_replace('/^(<\?xml[^>]+encoding)\s*=\s*(["\'])(.+?)\2/', '\\1="utf-8"', $out);
    }
    else {
      watchdog('php', 'Could not convert XML encoding %s to UTF-8.', array('%s' => $encoding), WATCHDOG_WARNING);
      return FALSE;
    }
  }

  $xml_parser = xml_parser_create($encoding);
  xml_parser_set_option($xml_parser, XML_OPTION_TARGET_ENCODING, 'utf-8');
  return $xml_parser;
}
154
/**
 * Convert data to UTF-8.
 *
 * The first available of iconv, mbstring and GNU recode is used, in that
 * order. Errors raised by the conversion function itself are suppressed.
 *
 * @param $data
 *   The data to be converted.
 * @param $encoding
 *   The encoding that the data is in.
 * @return
 *   The result of the conversion function, or FALSE when none of the
 *   supported extensions is installed.
 */
function drupal_convert_to_utf8($data, $encoding) {
  if (function_exists('iconv')) {
    return @iconv($encoding, 'utf-8', $data);
  }
  if (function_exists('mb_convert_encoding')) {
    return @mb_convert_encoding($data, 'utf-8', $encoding);
  }
  if (function_exists('recode_string')) {
    return @recode_string($encoding . '..utf-8', $data);
  }

  // No conversion library at all: log it and give up.
  watchdog('php', 'Unsupported encoding %s. Please install iconv, GNU recode or mbstring for PHP.', array('%s' => $encoding), WATCHDOG_ERROR);
  return FALSE;
}
184
/**
 * Truncate a UTF-8-encoded string safely to a number of bytes.
 *
 * If the end position is in the middle of a UTF-8 sequence, it scans backwards
 * until the beginning of the byte sequence, so the result never ends with a
 * partial character.
 *
 * Use this function whenever you want to chop off a string at an unsure
 * location. On the other hand, if you're sure that you're splitting on a
 * character boundary (e.g. after using strpos() or similar), you can safely use
 * substr() instead.
 *
 * The result is never longer than $len bytes. A $len of zero or less yields an
 * empty string. If the string is malformed and starts with continuation bytes
 * only, so that no character boundary exists before $len, it is cut at exactly
 * $len bytes.
 *
 * @param $string
 *   The string to truncate.
 * @param $len
 *   An upper limit on the returned string length, in bytes.
 * @return
 *   The truncated string.
 */
function drupal_truncate_bytes($string, $len) {
  if (strlen($string) <= $len) {
    return $string;
  }
  if ($len <= 0) {
    return '';
  }

  // Walk back from the cut position while it points at a continuation byte
  // (10xxxxxx); stop at the lead byte of that sequence or at offset 0.
  $cut = $len;
  while ($cut > 0 && (ord($string[$cut]) & 0xC0) == 0x80) {
    $cut--;
  }

  // Offset 0 is itself a continuation byte: the input is malformed and there
  // is no boundary to respect, so simply honor the byte limit.
  if ($cut == 0 && (ord($string[0]) & 0xC0) == 0x80) {
    return substr($string, 0, $len);
  }

  return substr($string, 0, $cut);
}
215
/**
 * Truncate a UTF-8-encoded string safely to a number of characters.
 *
 * @param $string
 *   The string to truncate.
 * @param $len
 *   An upper limit on the returned string length, in characters. When $dots is
 *   set, the trailing ' ...' counts towards this limit.
 * @param $wordsafe
 *   Flag to truncate at last space within the upper limit. Defaults to FALSE.
 * @param $dots
 *   Flag to add trailing dots. Defaults to FALSE. If $len is smaller than the
 *   four characters of ' ...', the dots themselves are shortened to fit.
 * @return
 *   The truncated string.
 */
function truncate_utf8($string, $len, $wordsafe = FALSE, $dots = FALSE) {

  if (drupal_strlen($string) <= $len) {
    return $string;
  }

  $suffix = '';
  if ($dots) {
    // Reserve room for the suffix. The remaining length must never become
    // negative: a negative length would be interpreted relative to the end of
    // the string and could return more text than was asked for.
    $suffix = ($len >= 4) ? ' ...' : (string) substr(' ...', 0, max(0, $len));
    $len = max(0, $len - 4);
  }

  if ($wordsafe) {
    // Keep one extra character so that a space directly after the limit still
    // counts as a word boundary.
    $string = drupal_substr($string, 0, $len + 1);
    // A space is a single ASCII byte, so byte-based functions are safe here.
    if ($last_space = strrpos($string, ' ')) { // space exists AND is not on position 0
      $string = substr($string, 0, $last_space);
    }
    else {
      $string = drupal_substr($string, 0, $len);
    }
  }
  else {
    $string = drupal_substr($string, 0, $len);
  }

  return $string . $suffix;
}
259
/**
 * Encodes MIME/HTTP header values that contain non-ASCII, UTF-8 encoded
 * characters.
 *
 * For example, mime_header_encode('tést.txt') returns "=?UTF-8?B?dMOpc3QudHh0?=".
 *
 * See http://www.rfc-editor.org/rfc/rfc2047.txt for more information.
 *
 * Notes:
 * - Only encode strings that contain non-ASCII characters.
 * - We progressively cut-off a chunk with drupal_truncate_bytes(). This is to
 *   ensure each chunk starts and ends on a character boundary.
 * - Each encoded word is at most 75 characters long, as RFC 2047 requires.
 * - Using \n as the chunk separator may cause problems on some systems and may
 *   have to be changed to \r\n or \r.
 *
 * @param $string
 *   The header value to encode.
 * @return
 *   The encoded header value, or $string unchanged if it only contains
 *   printable ASCII characters.
 */
function mime_header_encode($string) {
  if (!preg_match('/[^\x20-\x7E]/', $string)) {
    return $string;
  }

  // An encoded word may be 75 characters long, 12 of which are taken by
  // "=?UTF-8?B?" and "?=". Base64 output comes in groups of 4 characters per
  // 3 input bytes, so at most 60 characters (45 bytes) fit in the rest.
  $chunk_size = 45;
  $len = strlen($string);
  $output = '';
  while ($len > 0) {
    $chunk = (string) drupal_truncate_bytes($string, $chunk_size);
    // Malformed UTF-8 may leave no usable character boundary. Cut at the byte
    // limit in that case so that the loop always makes progress.
    if ($chunk === '' || strlen($chunk) > $chunk_size) {
      $chunk = substr($string, 0, $chunk_size);
    }
    $output .= ' =?UTF-8?B?' . base64_encode($chunk) . "?=\n";
    $c = strlen($chunk);
    $string = substr($string, $c);
    $len -= $c;
  }
  return trim($output);
}
291
/**
 * Complement to mime_header_encode.
 *
 * Decodes all RFC 2047 encoded words ("=?charset?encoding?data?=") in a header
 * value to UTF-8. The encoding marker (Q or B) is matched case-insensitively.
 *
 * @param $header
 *   The header value to decode.
 * @return
 *   The header value with every encoded word replaced by its decoded text.
 */
function mime_header_decode($header) {
  // Groups: 1 = character set, 2 = encoding (Q or B), 3 = encoded data. The
  // data may contain a question mark as long as it does not start the closing
  // "?=" delimiter.
  $encoded_word = '=\?([^?]+)\?(Q|B)\?((?:[^?]|\?(?!=))+)\?=';

  // First step: encoded chunks followed by other encoded chunks (need to collapse whitespace)
  $header = preg_replace_callback('/' . $encoded_word . '\s+(?==\?)/i', '_mime_header_decode', $header);
  // Second step: remaining chunks (do not collapse whitespace)
  return preg_replace_callback('/' . $encoded_word . '/i', '_mime_header_decode', $header);
}

/**
 * Helper function to mime_header_decode.
 *
 * @param $matches
 *   Regular expression match groups:
 *   - 1: Character set name
 *   - 2: Escaping method (Q or B, in either case)
 *   - 3: Encoded data
 * @return
 *   The decoded data; converted with drupal_convert_to_utf8() when the
 *   character set is not UTF-8.
 */
function _mime_header_decode($matches) {
  if (strtoupper($matches[2]) == 'B') {
    $data = base64_decode($matches[3]);
  }
  else {
    // In Q encoding "_" stands for a space. Substitute it before decoding so
    // that an escaped underscore ("=5F") survives as an underscore.
    $data = quoted_printable_decode(str_replace('_', ' ', $matches[3]));
  }
  if (strtolower($matches[1]) != 'utf-8') {
    $data = drupal_convert_to_utf8($data, $matches[1]);
  }
  return $data;
}
316
/**
 * Decode all HTML entities (including numerical ones) to regular UTF-8 bytes.
 * Double-escaped entities will only be decoded once ("&amp;lt;" becomes "&lt;", not "<").
 *
 * @param $text
 *   The text to decode entities in.
 * @param $exclude
 *   An array of characters which should not be decoded. For example,
 *   array('<', '&', '"'). This affects both named and numerical entities.
 * @return
 *   The text with entities decoded.
 */
function decode_entities($text, $exclude = array()) {
  static $html_entities;
  if (!isset($html_entities)) {
    // NOTE(review): the included file is presumably what fills
    // $html_entities; it is not visible from here.
    include DRUPAL_ROOT . '/includes/unicode.entities.inc';
  }

  // Flip the exclude list so that we can do quick lookups later.
  $exclude = array_flip($exclude);

  // Find all entities in one pass over the original text, so that the output
  // of one replacement is never decoded again. This deliberately avoids the
  // 'e' (eval) modifier of preg_replace(), which no longer exists in PHP 7.
  $text = (string) $text;
  if (!preg_match_all('/&(#x?)?([A-Za-z0-9]+);/', $text, $entities, PREG_SET_ORDER | PREG_OFFSET_CAPTURE)) {
    return $text;
  }

  $decoded = '';
  $position = 0;
  foreach ($entities as $entity) {
    // With PREG_OFFSET_CAPTURE each group is array(matched text, byte offset).
    $original = $entity[0][0];
    $offset = $entity[0][1];
    // Copy the literal text between the previous entity and this one.
    $decoded .= substr($text, $position, $offset - $position);
    $decoded .= _decode_entities($entity[1][0], $entity[2][0], $original, $html_entities, $exclude);
    $position = $offset + strlen($original);
  }
  // Append whatever follows the last entity.
  return $decoded . substr($text, $position);
}
342
/**
 * Helper function for decode_entities.
 *
 * @param $prefix
 *   '' for a named entity, '#' for a decimal or '#x' for a hexadecimal
 *   numerical entity.
 * @param $codepoint
 *   The entity name or the digits of the numerical entity.
 * @param $original
 *   The complete entity as found in the text, e.g. '&amp;' or '&#233;'.
 * @param $html_entities
 *   Lookup table mapping complete named entities to their UTF-8 replacement.
 * @param $exclude
 *   Array keyed by the characters which must not be decoded.
 * @return
 *   The decoded UTF-8 character, or $original when the entity is unknown,
 *   excluded, malformed or not a valid Unicode code point.
 */
function _decode_entities($prefix, $codepoint, $original, &$html_entities, &$exclude) {
  // Named entity
  if (!$prefix) {
    // A named entity not in the exclude list.
    if (isset($html_entities[$original]) && !isset($exclude[$html_entities[$original]])) {
      return $html_entities[$original];
    }
    return $original;
  }

  // "&#X41;" reaches this function as a decimal entity whose digits start
  // with an uppercase X; treat it as hexadecimal.
  if ($prefix == '#' && strncmp($codepoint, 'X', 1) == 0) {
    $prefix = '#x';
    $codepoint = (string) substr($codepoint, 1);
  }

  // Only well-formed digit strings are numerical entities; anything else
  // (e.g. "&#abc;") is left untouched.
  if ($prefix == '#x') {
    if (!preg_match('/^[0-9A-Fa-f]+$/D', $codepoint)) {
      return $original;
    }
    $codepoint = hexdec($codepoint);
  }
  else {
    if (!preg_match('/^[0-9]+$/D', $codepoint)) {
      return $original;
    }
    // A float conversion copes with leading zeros and overlong numbers alike.
    $codepoint = (float) $codepoint;
  }

  // Leave NUL, UTF-16 surrogates and values beyond the Unicode range alone.
  if ($codepoint < 1 || $codepoint > 0x10FFFF || ($codepoint >= 0xD800 && $codepoint <= 0xDFFF)) {
    return $original;
  }
  $codepoint = (int) $codepoint;

  // Encode codepoint as UTF-8 bytes
  if ($codepoint < 0x80) {
    $str = chr($codepoint);
  }
  elseif ($codepoint < 0x800) {
    $str = chr(0xC0 | ($codepoint >> 6))
         . chr(0x80 | ($codepoint & 0x3F));
  }
  elseif ($codepoint < 0x10000) {
    $str = chr(0xE0 | ($codepoint >> 12))
         . chr(0x80 | (($codepoint >> 6) & 0x3F))
         . chr(0x80 | ($codepoint & 0x3F));
  }
  else {
    $str = chr(0xF0 | ($codepoint >> 18))
         . chr(0x80 | (($codepoint >> 12) & 0x3F))
         . chr(0x80 | (($codepoint >> 6) & 0x3F))
         . chr(0x80 | ($codepoint & 0x3F));
  }

  // Check for excluded characters
  return isset($exclude[$str]) ? $original : $str;
}
392
/**
 * Count the number of characters in a UTF-8 string.
 *
 * The result is less than or equal to the byte count. mbstring is used when
 * the global $multibyte status says it is available; otherwise the characters
 * are counted by ignoring UTF-8 continuation bytes.
 */
function drupal_strlen($text) {
  global $multibyte;
  if ($multibyte != UNICODE_MULTIBYTE) {
    // Every character contributes exactly one byte that is not a continuation
    // byte, so dropping those leaves one byte per character.
    $without_continuation = preg_replace("/[\x80-\xBF]/", '', $text);
    return strlen($without_continuation);
  }
  return mb_strlen($text);
}
407
/**
 * Uppercase a UTF-8 string.
 *
 * Without mbstring only ASCII letters and the accented Latin-1 letters are
 * converted.
 */
function drupal_strtoupper($text) {
  global $multibyte;
  if ($multibyte == UNICODE_MULTIBYTE) {
    return mb_strtoupper($text);
  }

  // strtoupper() covers ASCII in the C locale; the callback then flips the
  // case of the lowercase Latin-1 accented letters.
  $ascii_upper = strtoupper($text);
  return preg_replace_callback('/\xC3[\xA0-\xB6\xB8-\xBE]/', '_unicode_caseflip', $ascii_upper);
}
424
/**
 * Lowercase a UTF-8 string.
 *
 * Without mbstring only ASCII letters and the accented Latin-1 letters are
 * converted.
 */
function drupal_strtolower($text) {
  global $multibyte;
  if ($multibyte == UNICODE_MULTIBYTE) {
    return mb_strtolower($text);
  }

  // strtolower() covers ASCII in the C locale; the callback then flips the
  // case of the uppercase Latin-1 accented letters.
  $ascii_lower = strtolower($text);
  return preg_replace_callback('/\xC3[\x80-\x96\x98-\x9E]/', '_unicode_caseflip', $ascii_lower);
}
441
/**
 * Helper function for case conversion of Latin-1.
 *
 * Used as a preg_replace_callback() callback: it receives a matched two-byte
 * UTF-8 sequence, keeps the first byte and toggles bit 0x20 of the second
 * byte, which switches the accented letter between upper and lower case.
 */
function _unicode_caseflip($matches) {
  $sequence = $matches[0];
  $flipped = ord($sequence[1]) ^ 0x20;
  return $sequence[0] . chr($flipped);
}
449
/**
 * Capitalize the first letter of a UTF-8 string.
 *
 * mbstring has no equivalent, so the string is split after its first
 * character and only that character is uppercased.
 */
function drupal_ucfirst($text) {
  $first = drupal_substr($text, 0, 1);
  $remainder = drupal_substr($text, 1);
  return drupal_strtoupper($first) . $remainder;
}
457
/**
 * Cut off a piece of a string based on character indices and counts. Follows
 * the same behavior as PHP's own substr() function, except that an empty
 * string is returned where substr() would return FALSE.
 *
 * Note that for cutting off a string at a known character/substring
 * location, the usage of PHP's normal strpos/substr is safe and
 * much faster.
 *
 * @param $text
 *   The UTF-8 encoded string to cut.
 * @param $start
 *   Index of the first character to return. A negative value counts from the
 *   end of the string.
 * @param $length
 *   Number of characters to return. A negative value stops that many
 *   characters before the end of the string; NULL (the default) returns
 *   everything up to the end.
 * @return
 *   The requested part of $text.
 */
function drupal_substr($text, $start, $length = NULL) {
  global $multibyte;
  if ($multibyte == UNICODE_MULTIBYTE) {
    return $length === NULL ? mb_substr($text, $start) : mb_substr($text, $start, $length);
  }

  // Emulation: translate the character indices into byte offsets.
  $text = (string) $text;
  $strlen = strlen($text);
  $start = (int) $start;
  if ($length !== NULL) {
    $length = (int) $length;
    if ($length == 0) {
      return '';
    }
  }

  // $end is the index of the first character NOT to return; NULL means "up to
  // the end of the string".
  $end = NULL;
  if ($start < 0 || ($length !== NULL && $length < 0)) {
    // Negative arguments are relative to the number of characters, which is
    // the number of bytes that are not UTF-8 continuation bytes.
    $count = strlen(preg_replace("/[\x80-\xBF]/", '', $text));
    if ($start < 0) {
      $start = max(0, $count + $start);
    }
    if ($length !== NULL && $length < 0) {
      $end = $count + $length;
    }
  }
  if ($length !== NULL && $length > 0) {
    $end = $start + $length;
  }
  if ($end !== NULL && $end <= $start) {
    return '';
  }

  // Walk over the string once, counting character start bytes, and note the
  // byte offsets at which characters number $start and $end begin. An index
  // that is never reached lies at or beyond the end of the string.
  $istart = ($start == 0) ? 0 : $strlen;
  $iend = $strlen;
  $chars = 0;
  for ($i = 0; $i < $strlen; $i++) {
    $c = ord($text[$i]);
    if ($c < 0x80 || $c >= 0xC0) {
      if ($start > 0 && $chars == $start) {
        $istart = $i;
      }
      if ($end !== NULL && $chars == $end) {
        $iend = $i;
        break;
      }
      $chars++;
    }
  }

  // The cast turns the FALSE that older PHP versions return for an empty
  // result into an empty string.
  return (string) substr($text, $istart, $iend - $istart);
}
548
549
550

Legend

Missed
Lines of code that were not exercised during program execution.
Covered
Lines of code that were exercised during program execution.
Comment/non executable
Comment or non-executable line of code.
Dead
Lines of code that, according to Xdebug, could not be executed. These are counted as covered code because in almost all cases the code is in fact runnable.