소스 검색

[sluggable] improved transliterator, closes #65, references #27

gediminasm 14 년 전
부모
커밋
6251145001
3개의 파일 변경: 150줄 추가, 61줄 삭제
  1. 128 0
      lib/Gedmo/Sluggable/Util/LICENSE
  2. 22 61
      lib/Gedmo/Sluggable/Util/Urlizer.php
  3. BIN
      lib/Gedmo/Sluggable/Util/data/x00.php

+ 128 - 0
lib/Gedmo/Sluggable/Util/LICENSE

@@ -0,0 +1,128 @@
+			 The "Artistic License"
+
+				Preamble
+
+The intent of this document is to state the conditions under which a
+Package may be copied, such that the Copyright Holder maintains some
+semblance of artistic control over the development of the package,
+while giving the users of the package the right to use and distribute
+the Package in a more-or-less customary fashion, plus the right to make
+reasonable modifications.
+
+Definitions:
+
+	"Package" refers to the collection of files distributed by the
+	Copyright Holder, and derivatives of that collection of files
+	created through textual modification.
+
+	"Standard Version" refers to such a Package if it has not been
+	modified, or has been modified in accordance with the wishes
+	of the Copyright Holder as specified below.
+
+	"Copyright Holder" is whoever is named in the copyright or
+	copyrights for the package.
+
+	"You" is you, if you're thinking about copying or distributing
+	this Package.
+
+	"Reasonable copying fee" is whatever you can justify on the
+	basis of media cost, duplication charges, time of people involved,
+	and so on.  (You will not be required to justify it to the
+	Copyright Holder, but only to the computing community at large
+	as a market that must bear the fee.)
+
+	"Freely Available" means that no fee is charged for the item
+	itself, though there may be fees involved in handling the item.
+	It also means that recipients of the item may redistribute it
+	under the same conditions they received it.
+
+1. You may make and give away verbatim copies of the source form of the
+Standard Version of this Package without restriction, provided that you
+duplicate all of the original copyright notices and associated disclaimers.
+
+2. You may apply bug fixes, portability fixes and other modifications
+derived from the Public Domain or from the Copyright Holder.  A Package
+modified in such a way shall still be considered the Standard Version.
+
+3. You may otherwise modify your copy of this Package in any way, provided
+that you insert a prominent notice in each changed file stating how and
+when you changed that file, and provided that you do at least ONE of the
+following:
+
+    a) place your modifications in the Public Domain or otherwise make them
+    Freely Available, such as by posting said modifications to Usenet or
+    an equivalent medium, or placing the modifications on a major archive
+    site such as uunet.uu.net, or by allowing the Copyright Holder to include
+    your modifications in the Standard Version of the Package.
+
+    b) use the modified Package only within your corporation or organization.
+
+    c) rename any non-standard executables so the names do not conflict
+    with standard executables, which must also be provided, and provide
+    a separate manual page for each non-standard executable that clearly
+    documents how it differs from the Standard Version.
+
+    d) make other distribution arrangements with the Copyright Holder.
+
+4. You may distribute the programs of this Package in object code or
+executable form, provided that you do at least ONE of the following:
+
+    a) distribute a Standard Version of the executables and library files,
+    together with instructions (in the manual page or equivalent) on where
+    to get the Standard Version.
+
+    b) accompany the distribution with the machine-readable source of
+    the Package with your modifications.
+
+    c) give non-standard executables non-standard names, and clearly
+    document the differences in manual pages (or equivalent), together
+    with instructions on where to get the Standard Version.
+
+    d) make other distribution arrangements with the Copyright Holder.
+
+5. You may charge a reasonable copying fee for any distribution of this
+Package.  You may charge any fee you choose for support of this
+Package.  You may not charge a fee for this Package itself.  However,
+you may distribute this Package in aggregate with other (possibly
+commercial) programs as part of a larger (possibly commercial) software
+distribution provided that you do not advertise this Package as a
+product of your own.  You may embed this Package's interpreter within
+an executable of yours (by linking); this shall be construed as a mere
+form of aggregation, provided that the complete Standard Version of the
+interpreter is so embedded.
+
+6. The scripts and library files supplied as input to or produced as
+output from the programs of this Package do not automatically fall
+under the copyright of this Package, but belong to whoever generated
+them, and may be sold commercially, and may be aggregated with this
+Package.  If such scripts or library files are aggregated with this
+Package via the so-called "undump" or "unexec" methods of producing a
+binary executable image, then distribution of such an image shall
+neither be construed as a distribution of this Package nor shall it
+fall under the restrictions of Paragraphs 3 and 4, provided that you do
+not represent such an executable image as a Standard Version of this
+Package.
+
+7. C subroutines (or comparably compiled subroutines in other
+languages) supplied by you and linked into this Package in order to
+emulate subroutines and variables of the language defined by this
+Package shall not be considered part of this Package, but are the
+equivalent of input as in Paragraph 6, provided these subroutines do
+not change the language in any way that would cause it to fail the
+regression tests for the language.
+
+8. Aggregation of this Package with a commercial distribution is always
+permitted provided that the use of this Package is embedded; that is,
+when no overt attempt is made to make this Package's interfaces visible
+to the end user of the commercial distribution.  Such use shall not be
+construed as a distribution of this Package.
+
+9. The name of the Copyright Holder may not be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+10. THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR
+IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
+WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+
+				The End
+

+ 22 - 61
lib/Gedmo/Sluggable/Util/Urlizer.php

@@ -208,86 +208,47 @@ class Urlizer
     * @param string (default = ?) Character use if character unknown
     * @return string US-ASCII string
     */
-    public static function utf8ToAscii($str, $unknown = '?') {
-
-        # The database for transliteration stored here
-        static $UTF8_TO_ASCII = array();
-
-        # Variable lookups faster than accessing constants
-        $UTF8_TO_ASCII_DB = __DIR__ . '/data';
+    public static function utf8ToAscii($str, $unknown = '?')
+    {
+        static $UTF8_TO_ASCII;
 
         if (strlen($str) == 0) {
-            return '';
+            return;
         }
 
-        $len = strlen($str);
-        $i = 0;
-        $result = '';
+        preg_match_all('/.{1}|[^\x00]{1,1}$/us', $str, $ar);
+        $chars = $ar[0];
 
-        while ($i < $len) {
-            $ord = NULL;
-            $increment = 1;
-
-            $ord0 = ord($str{$i});
-
-            # Much nested if /else - PHP fn calls expensive, no block scope...
-            # 1 byte - ASCII
-            if ($ord0 >= 0 && $ord0 <= 127) {
-                $ord = $ord0;
-                $increment = 1;
-            } else {
-                # 2 bytes
-                $ord1 = ord($str{$i+1});
-                if ($ord0 >= 192 && $ord0 <= 223) {
-                    $ord = ($ord0 - 192) * 64 + ($ord1 - 128);
-                    $increment = 2;
-                } else {
-                    # 3 bytes
-                    $ord2 = ord($str{$i+2});
-                    if ($ord0 >= 224 && $ord0 <= 239) {
-                        $ord = ($ord0-224)*4096 + ($ord1-128)*64 + ($ord2-128);
-                        $increment = 3;
-                    } else {
-                        # 4 bytes
-                        $ord3 = ord($str{$i+3});
-                        if ($ord0>=240 && $ord0<=247) {
-                            $ord = ($ord0-240)*262144 + ($ord1-128)*4096
-                                + ($ord2-128)*64 + ($ord3-128);
-                            $increment = 4;
-                        } else {
-                            throw new \Gedmo\Exception\UnexpectedValueException('Unidentified ut8 character was present, pure utf8 required');
-                        }
-                    }
-                }
-            }
+        foreach ($chars as $i => $c) {
+            $ud = 0;
+            if (ord($c{0})>=0 && ord($c{0})<=127) { continue; } // ASCII - next please
+            if (ord($c{0})>=192 && ord($c{0})<=223) { $ord = (ord($c{0})-192)*64 + (ord($c{1})-128); }
+            if (ord($c{0})>=224 && ord($c{0})<=239) { $ord = (ord($c{0})-224)*4096 + (ord($c{1})-128)*64 + (ord($c{2})-128); }
+            if (ord($c{0})>=240 && ord($c{0})<=247) { $ord = (ord($c{0})-240)*262144 + (ord($c{1})-128)*4096 + (ord($c{2})-128)*64 + (ord($c{3})-128); }
+            if (ord($c{0})>=248 && ord($c{0})<=251) { $ord = (ord($c{0})-248)*16777216 + (ord($c{1})-128)*262144 + (ord($c{2})-128)*4096 + (ord($c{3})-128)*64 + (ord($c{4})-128); }
+            if (ord($c{0})>=252 && ord($c{0})<=253) { $ord = (ord($c{0})-252)*1073741824 + (ord($c{1})-128)*16777216 + (ord($c{2})-128)*262144 + (ord($c{3})-128)*4096 + (ord($c{4})-128)*64 + (ord($c{5})-128); }
+            if (ord($c{0})>=254 && ord($c{0})<=255) { $chars{$i} = $unknown; continue; } //error
 
             $bank = $ord >> 8;
 
-            # If we haven't used anything from this bank before, need to load it...
-            if (!array_key_exists($bank, $UTF8_TO_ASCII)) {
-                $bankfile = $UTF8_TO_ASCII_DB. '/'. sprintf("x%02x", $bank).'.php';
-
+            if (!array_key_exists($bank, (array)$UTF8_TO_ASCII)) {
+                $bankfile = __DIR__. '/data/'. sprintf("x%02x",$bank).'.php';
                 if (file_exists($bankfile)) {
-                    # Load the appropriate database
-                    if (!include $bankfile) {
-                        throw new \Gedmo\Exception\RuntimeException('Cannot find character bank file: ' . $bankfile);
-                    }
+                    include $bankfile;
                 } else {
-                    # Some banks are deliberately empty
                     $UTF8_TO_ASCII[$bank] = array();
                 }
             }
 
             $newchar = $ord & 255;
-            if (isset($UTF8_TO_ASCII[$bank]) && array_key_exists($newchar, $UTF8_TO_ASCII[$bank])) {
-                $result .= $UTF8_TO_ASCII[$bank][$newchar];
+            if (array_key_exists($newchar, $UTF8_TO_ASCII[$bank])) {
+                $chars{$i} = $UTF8_TO_ASCII[$bank][$newchar];
             } else {
-                $result .= $unknown;
+                $chars{$i} = $unknown;
             }
-            $i += $increment;
         }
 
-        return $result;
+        return implode('', $chars);
     }
 
     /**

BIN
lib/Gedmo/Sluggable/Util/data/x00.php