Patch Name: Jam Asian Text Supporting patch
Patch URI: http://www.pobox.com/~jam/unix/wordpress/#patches
Description: A patch to modify WordPress 2.0.5 to support Asian text in its excerpt constructing functions.
Version: 1.4
Author: Kazushi (Jam) Marukawa
Author URI: http://www.pobox.com/~jam/

How to install this:
1. Apply this patch.
2. That's all.

Index: ./wp-includes/comment-functions.php
===================================================================
--- ./wp-includes/comment-functions.php
+++ ./wp-includes/comment-functions.php	(working copy)
@@ -448,19 +448,7 @@
 function get_comment_excerpt() {
 	global $comment;
 	$comment_text = strip_tags($comment->comment_content);
-	$blah = explode(' ', $comment_text);
-	if (count($blah) > 20) {
-		$k = 20;
-		$use_dotdotdot = 1;
-	} else {
-		$k = count($blah);
-		$use_dotdotdot = 0;
-	}
-	$excerpt = '';
-	for ($i=0; $i<$k; $i++) {
-		$excerpt .= $blah[$i] . ' ';
-	}
-	$excerpt .= ($use_dotdotdot) ? '...' : '';
+	$excerpt = jamul_create_simple_excerpt($comment_text, 20);
 	return apply_filters('get_comment_excerpt', $excerpt);
 }
 
Index: ./wp-includes/feed-functions.php
===================================================================
--- ./wp-includes/feed-functions.php
+++ ./wp-includes/feed-functions.php	(working copy)
@@ -35,19 +35,7 @@
 		$content = strip_tags($content);
 	}
 	if ($cut) {
-		$blah = explode(' ', $content);
-		if (count($blah) > $cut) {
-			$k = $cut;
-			$use_dotdotdot = 1;
-		} else {
-			$k = count($blah);
-			$use_dotdotdot = 0;
-		}
-		for ($i=0; $i<$k; $i++) {
-			$excerpt .= $blah[$i].' ';
-		}
-		$excerpt .= ($use_dotdotdot) ? '...' : '';
-		$content = $excerpt;
+		$content = jamul_create_simple_excerpt($content, $cut);
 	}
 	$content = str_replace(']]>', ']]&gt;', $content);
 	echo wpj_utf8_encode($content);
Index: ./wp-includes/functions-formatting.php
===================================================================
--- ./wp-includes/functions-formatting.php
+++ ./wp-includes/functions-formatting.php	(working copy)
@@ -754,13 +754,7 @@
 		$text = apply_filters('the_content', $text);
 		$text = str_replace(']]>', ']]&gt;', $text);
 		$text = strip_tags($text);
-		$excerpt_length = 55;
-		$words = explode(' ', $text, $excerpt_length + 1);
-		if (count($words) > $excerpt_length) {
-			array_pop($words);
-			array_push($words, '[...]');
-			$text = implode(' ', $words);
-		}
+		$text = jamul_trim_excerpt($text, 55);
 	}
 	return $text;
 }
Index: ./wp-includes/functions-post.php
===================================================================
--- ./wp-includes/functions-post.php
+++ ./wp-includes/functions-post.php	(working copy)
@@ -625,7 +625,7 @@
 		$excerpt = strip_tags($post_excerpt?$post_excerpt:$post_content);
 
 		if (strlen($excerpt) > 255) {
-			$excerpt = substr($excerpt,0,252) . '...';
+			$excerpt = jamul_substr($excerpt,0,252) . '...';
 		}
 
 		$trackback_urls = explode(',', $tb_list);
@@ -712,7 +712,7 @@
 	if ( function_exists('mb_strcut') ) // For international trackbacks
 		$excerpt = mb_strcut($excerpt, 0, 252, get_settings('blog_charset')) . '...';
 	else
-		$excerpt = mb_substr($excerpt, 0, 84) . '...';
+		$excerpt = jamul_substr($excerpt, 0, 252) . '...';
 
 	$post_title = apply_filters('the_title', $post->post_title);
 	$post_title = strip_tags($post_title);
Index: ./wp-includes/functions.php
===================================================================
--- ./wp-includes/functions.php
+++ ./wp-includes/functions.php	(working copy)
@@ -2923,4 +2923,238 @@
 	return wpj_utf8_encode(apply_filters('wpj_comment_author', $author));
 }
 
+/*
+Plugin Name: Jam UTF8 Library
+Plugin URI: http://www.pobox.com/~jam/unix/wordpress/
+Description: Define several UTF8 friendly functions for developers.
+Version: 1.3
+Author: Kazushi (Jam) Marukawa
+Author URI: http://www.pobox.com/~jam/
+
+*/
+
+// Upper bound, in bytes, on the average length of one word.  When a
+// trimmed string is longer than (word count * this value), the trimming
+// functions assume the text contains no whitespace (common in Asian
+// text) and trim it again by byte length instead.
+define("JAMUL_LEN_MAX1WORD", 8);
+
+// Average length of one word in bytes.  Used to convert a word count
+// into the byte length of the trimmed string.
+define("JAMUL_LEN_1WORD", 5);
+
+// Number of bytes to look around the trimming point when searching for
+// a delimiter (either a whitespace or a point where the text switches
+// between ASCII and other characters).  This is roughly the byte length
+// of a few multi-byte characters.
+define("JAMUL_LEN_SEARCH", 15);
+
+// Return true when $string has more bytes than $count words of
+// JAMUL_LEN_MAX1WORD bytes each would occupy.
+function jamul_is_too_long($string, $count) {
+    return strlen($string) > $count * JAMUL_LEN_MAX1WORD;
+}
+
+// Scan forward from byte offset $pos and return the offset of the first
+// byte that is not a UTF-8 continuation byte (0x80-0xBF), i.e. an ASCII
+// byte or the lead byte of a multi-byte character.  The scan is limited
+// to offsets below $stop (end of string when $stop is negative or too
+// large); if no such byte is found the scan ends at $stop.
+function jamul_find_1stbyte($string, $pos=0, $stop=-1) {
+    $len = strlen($string);
+    if ($stop < 0 || $stop > $len) {
+        $stop = $len;
+    }
+    for (; $pos < $stop; $pos++) {
+        $code = ord($string[$pos]);
+        if ($code < 0x80 || $code >= 0xC0) {
+            break; // first byte of a character found.
+        }
+    }
+    return $pos;
+}
+
+// Scan backward from byte offset $pos (the last byte when $pos is
+// negative or past the end) and return the offset of the nearest byte
+// that is not a UTF-8 continuation byte.  The scan does not go below
+// $stop; a value below $stop is returned when no such byte is found.
+function jamul_find_1stbyte_reverse($string, $pos=-1, $stop=0) {
+    $len = strlen($string);
+    if ($pos < 0 || $pos >= $len) {
+        $pos = $len - 1;
+    }
+    for (; $pos >= $stop; $pos--) {
+        $code = ord($string[$pos]);
+        if ($code < 0x80 || $code >= 0xC0) {
+            break; // first byte of a character found.
+        }
+    }
+    return $pos;
+}
+
+// Move the trimming point $pos to a nearby delimiter so that a word or
+// a multi-byte character is not cut in the middle.  The search is
+// limited to the byte range [$min, $max], which defaults to
+// JAMUL_LEN_SEARCH bytes on each side of $pos.  Returns the adjusted
+// offset, or $pos unchanged when it is out of range or no delimiter is
+// found.
+function jamul_find_delimiter($string, $pos=0, $min = -1, $max=-1) {
+    $len = strlen($string);
+    if ($pos <= 0 || $pos >= $len) {
+        return $pos;
+    }
+    if ($min < 0) {
+        $min = max(0, $pos - JAMUL_LEN_SEARCH);
+    }
+    if ($max < 0 || $max >= $len) {
+        $max = min($len - 1, $pos + JAMUL_LEN_SEARCH);
+    }
+    if (ord($string[$pos]) < 0x80) {
+        // Found ASCII character at the trimming point.  Look for a new
+        // trimming point around $pos: either a whitespace or the
+        // transition between ASCII and other characters.
+        $pos3 = -1;
+        for ($pos2 = $pos; $pos2 <= $max; $pos2++) {
+            if ($string[$pos2] == ' ') {
+                break;
+            } else if ($pos3 < 0 && ord($string[$pos2]) >= 0x80) {
+                $pos3 = $pos2;
+            }
+        }
+        if ($pos2 > $max && $pos3 >= 0) {
+            $pos2 = $pos3;
+        }
+        if ($pos2 > $max) {
+            // Nothing found ahead of $pos, so search backward.
+            $pos3 = -1;
+            for ($pos2 = $pos; $pos2 >= $min; $pos2--) {
+                if ($string[$pos2] == ' ') {
+                    break;
+                } else if ($pos3 < 0 && ord($string[$pos2]) >= 0x80) {
+                    $pos3 = $pos2 + 1;
+                }
+            }
+            if ($pos2 < $min && $pos3 >= 0) {
+                $pos2 = $pos3;
+            }
+        }
+        if ($pos2 <= $max && $pos2 >= $min) {
+            $pos = $pos2;
+        }
+    } else {
+        // Non-ASCII byte at the trimming point: move forward to the
+        // nearest byte that starts a character (this is $pos itself
+        // when $pos is already on a lead byte).
+        $pos = jamul_find_1stbyte($string, $pos, $max);
+    }
+    return $pos;
+}
+
+// Cut $string down to at most $byte bytes without splitting a
+// multi-byte character.  Strings that already fit are returned as is.
+function jamul_truncate($string, $byte) {
+    $len = strlen($string);
+    if ($len <= $byte) {
+        return $string;
+    }
+    if ($byte <= 0) {
+        return '';
+    }
+    $byte = jamul_find_1stbyte_reverse($string, $byte);
+    return substr($string, 0, max(0, $byte));
+}
+
+// Byte-oriented counterpart of substr() that never ends in the middle of
+// a multi-byte character.  mb_substr() counts characters, but callers
+// here need a string that fits within $byte bytes.
+//
+// $start is a byte offset (negative counts from the end, as in substr())
+// and $byte is the maximum number of bytes to return (negative means
+// "up to the end of the string").
+function jamul_substr($string, $start, $byte=-1) {
+    $len = strlen($string);
+    if ($start < 0) {
+        // Negative offsets count from the end of the string, as in
+        // substr().
+        $start = max(0, $len + $start);
+    }
+    if ($start >= $len) {
+        return '';
+    }
+    if ($byte < 0 || $byte >= $len - $start) {
+        // The rest of the string fits; there is nothing to cut, so the
+        // last character must not be dropped.
+        return substr($string, $start);
+    }
+    $end = jamul_find_1stbyte_reverse($string, $start + $byte, $start);
+    return substr($string, $start, max(0, $end - $start));
+}
+
+// Return roughly $word words of $string starting at byte offset $start.
+// The byte budget is $word * JAMUL_LEN_1WORD; the cut is then moved to a
+// nearby delimiter so that words and multi-byte characters stay intact.
+// A negative $word returns the rest of the string.
+function jamul_substr_word($string, $start, $word=-1) {
+    $len = strlen($string);
+    if ($start < 0) {
+        // Negative offsets count from the end, as in substr().
+        $start = max(0, $len + $start);
+    }
+    if ($word < 0) {
+        return substr($string, $start);
+    }
+    $byte = $word * JAMUL_LEN_1WORD;
+    if ($byte >= $len - $start) {
+        return substr($string, $start);
+    }
+    $pos = jamul_find_1stbyte_reverse($string, $start + $byte, $start);
+    $pos = jamul_find_delimiter($string, $pos);
+    return substr($string, $start, max(0, $pos - $start));
+}
+
+// Build an excerpt of at most $count space-separated words, followed by
+// '...' when words were dropped.  If the result is still too long in
+// bytes, fall back to trimming by byte length.
+function jamul_create_simple_excerpt($string, $count) {
+    $blah = explode(' ', $string);
+    if (count($blah) > $count) {
+        $k = $count;
+        $use_dotdotdot = 1;
+    } else {
+        $k = count($blah);
+        $use_dotdotdot = 0;
+    }
+    $excerpt = '';
+    for ($i=0; $i<$k; $i++) {
+        $excerpt .= $blah[$i] . ' ';
+    }
+    $excerpt .= ($use_dotdotdot) ? '...' : '';
+    if (jamul_is_too_long($excerpt, $count)) {
+        // too many bytes in $excerpt... this happens if $string doesn't
+        // contain whitespaces (common in Asian text).
+        return jamul_substr_word($string, 0, $count) . " ...";
+    }
+    return $excerpt;
+}
+
+// Trim $string to at most $count space-separated words, appending
+// '[...]' when words were dropped.  If the result is still too long in
+// bytes, fall back to trimming by byte length.
+function jamul_trim_excerpt($string, $count) {
+    $words = explode(' ', $string, $count + 1);
+    if (count($words) > $count) {
+        array_pop($words);
+        array_push($words, '[...]');
+        $text = implode(' ', $words);
+    } else {
+        $text = $string;
+    }
+    if (jamul_is_too_long($text, $count)) {
+        // too many bytes in $text... this happens if $string doesn't
+        // contain whitespaces (common in Asian text).
+        return jamul_substr_word($string, 0, $count) . " [...]";
+    }
+    return $text;
+}
+
 ?>
Index: ./wp-trackback.php
===================================================================
--- ./wp-trackback.php
+++ ./wp-trackback.php	(working copy)
@@ -97,8 +97,8 @@
 	$excerpt = mb_substr($excerpt, 0, 84, get_settings('blog_charset')) . ' (more...)';
 	$title = mb_substr($title, 0, 30, get_settings('blog_charset')) . '...';
 } else {
-	$excerpt = (strlen($excerpt) > 255) ? substr($excerpt, 0, 252) . '...' : $excerpt;
-	$title = (strlen($title) > 250) ? substr($title, 0, 250) . '...' : $title;
+	$excerpt = (strlen($excerpt) > 255) ? jamul_substr($excerpt, 0, 252) . '...' : $excerpt;
+	$title = (strlen($title) > 250) ? jamul_substr($title, 0, 250) . '...' : $title;
 }
 
 $comment_post_ID = $tb_id;