<?php
require_once __DIR__.'/python.token.php';
require_once __DIR__.'/python.literal.php';
require_once __DIR__.'/python.interface.php';
/**
 * Tokenizer for a subset of Python source code.
 *
 * Converts raw source text into a flat list of token objects, including the
 * synthetic NEWLINE / INDENT / DEDENT / ENDMARKER tokens that describe the
 * block structure, and a FAIL token where the input cannot be tokenized.
 */
class lang_python_tokenize extends lang_python_interface {

    /**
     * Tokenize a piece of Python source code.
     *
     * Side effect: the resulting token list is also echoed as an HTML <pre>
     * block (debug dump); token text is HTML-escaped before being printed.
     *
     * @param string $code Source text. Trailing whitespace is ignored.
     * @return array List of tokens, always terminated by NEWLINE, one DEDENT
     *               per still-open indentation level, and ENDMARKER.
     *               Tokenizing stops after the first token that consumes no
     *               input (normally a FAIL token).
     */
    public static function python_tokenize($code) {
        $rest   = rtrim($code);
        $tokens = array();
        $indent = 0;       // indentation width of the current logical line
        $levels = array(); // stack of open indentation widths, strictly increasing

        // Blank lines before the first statement produce no tokens.
        while (($eol = static::python_tokenize_newline($rest)) > 0) {
            $rest = substr($rest, $eol);
        }

        while (strlen($rest) > 0) {
            // Collapse any run of line breaks / blank lines into one NEWLINE.
            $skipped = 0;
            while (($eol = static::python_tokenize_newline($rest)) > 0) {
                $rest = substr($rest, $eol);
                $skipped += $eol;
            }
            if ($skipped > 0) {
                $tokens[] = new lang_python_token('NEWLINE');
            }

            // Strip horizontal whitespace in front of the next token.
            $width = static::python_tokenize_whitespace($rest);
            if ($width > 0) {
                $space = substr($rest, 0, $width);
                $rest  = substr($rest, $width);
            } else {
                $space = '';
            }

            // Leading whitespace only carries meaning at the start of a line.
            if ($skipped > 0) {
                $column = static::python_tokenize_compute_space($space);
                if ($column > $indent) {
                    $tokens[] = new lang_python_token('INDENT');
                    $levels[] = $column;
                    $indent   = $column;
                } else if ($column < $indent) {
                    if ($column === 0 || in_array($column, $levels, true)) {
                        // Close every level deeper than the new column. The
                        // target level itself stays on the stack untouched, so
                        // each INDENT is matched by exactly one DEDENT.
                        while (count($levels) > 0 && end($levels) > $column) {
                            array_pop($levels);
                            $tokens[] = new lang_python_token('DEDENT');
                        }
                        $indent = $column;
                    } else {
                        // Dedent to a column that was never opened.
                        $tokens[] = new lang_python_token('FAIL');
                    }
                }
            }

            $token    = static::python_tokenize_next($rest);
            $tokens[] = $token;
            // A token that consumes nothing (e.g. FAIL) would never let the
            // loop advance; stop here instead of spinning on the same input.
            if ((int)$token->length <= 0) {
                break;
            }
            $rest = substr($rest, $token->length);
        }

        $tokens[] = new lang_python_token('NEWLINE');
        while (count($levels) > 0) {
            array_pop($levels);
            $tokens[] = new lang_python_token('DEDENT');
        }
        $tokens[] = new lang_python_token('ENDMARKER');

        // Debug dump of the token stream. Values originate from the tokenized
        // source, so they are escaped before being written into HTML.
        echo '<pre>';
        foreach ($tokens as $token) {
            $line = $token->name . ': ' . $token->getValue();
            echo htmlspecialchars($line, ENT_QUOTES | ENT_SUBSTITUTE) . "\n";
        }
        echo '</pre>';

        return $tokens;
    }

    /**
     * Measure a line break at the start of $c.
     *
     * A line break is optional spaces/tabs followed by CR, LF or CR LF, so a
     * whitespace-only line counts as part of the break.
     *
     * @param string $c Remaining input.
     * @return int Number of bytes to skip, or 0 if $c does not start with a
     *             line break.
     */
    public static function python_tokenize_newline($c) {
        $blank = strspn($c, " \t");
        $end   = $blank;
        if (isset($c[$end]) && $c[$end] === "\r") {
            $end++;
        }
        if (isset($c[$end]) && $c[$end] === "\n") {
            $end++;
        }
        // Whitespace without a following CR/LF is not a line break.
        return $end > $blank ? $end : 0;
    }

    /**
     * Count the spaces and tabs at the start of $c.
     *
     * @param string $c Remaining input.
     * @return int Length in bytes of the leading space/tab run (0 if none).
     */
    public static function python_tokenize_whitespace($c) {
        return strspn($c, " \t");
    }

    /**
     * Read the next token from the start of $c.
     *
     * @param string $c Remaining input, expected to start at a token.
     * @return mixed A lang_python_token; for a comment, whatever
     *               python_parse_comment() returns. A FAIL token (without a
     *               length) is returned when nothing matches.
     */
    public static function python_tokenize_next($c) {
        // First match wins, so longer operators must be listed before their
        // one-character prefixes ("==" before "=", "..." before ".").
        // Keywords end in \b so identifiers such as "iffy" or "printer" are
        // not split into a keyword and a name.
        $patterns = array(
            '/^\.\.\./' => 'TRIEPLEDOT',
            '/^==/' => "EQUAL",
            '/^>=/' => "GREATEQUAL",
            '/^<=/' => "LESSEQUAL",
            '/^,/' => "COMMA",
            '/^\./' => "DOT",
            '/^:/' => "COLON",
            '/^;/' => "SEMICOLON",
            '/^=/' => "EQ",
            '/^\+/' => "PLUS",
            '/^%/' => "MOD",
            '/^>/' => "GREAT",
            '/^</' => "LESS",
            '/^\(/' => "RBO",
            '/^\)/' => "RBC",
            '/^\[/' => "SBO",
            '/^\]/' => "SBC",
            '/^{/' => "BO",
            '/^}/' => "BC",
            '/^print\b/' => "PRINT",
            '/^import\b/' => "IMPORT",
            '/^from\b/' => "FROM",
            '/^class\b/' => "CLASS",
            '/^def\b/' => "DEF",
            '/^return\b/' => "RETURN",
            '/^while\b/' => "WHILE",
            '/^if\b/' => 'IF',
            '/^True\b/' => 'TRUE',
            '/^False\b/' => 'FALSE',
            '/^None\b/' => 'NONE',
        );
        foreach ($patterns as $pattern => $name) {
            if (re($pattern, $c, $m)) {
                return new lang_python_token($name, $m[0], strlen($m[0]));
            }
        }

        if (re('|^#.*|', $c, $m)) {
            // NOTE(review): python_parse_comment() is not defined in this
            // file; presumably provided by one of the required files.
            return python_parse_comment($m[0]);
        }
        if (re('/^[0-9]+(\.[0-9]*)?/', $c, $m)) {
            // Keep the fractional part of literals such as "3.14".
            $number = strpos($m[0], '.') !== false ? (float)$m[0] : (int)$m[0];
            return new lang_python_token("NUMBER", $number, strlen($m[0]));
        }
        if (re('/^"""(.*?)"""/s', $c, $m)) {
            // Lazy + dot-all: stop at the first closing delimiter and allow
            // the literal to span several lines.
            return new lang_python_token("STRING", $m[1], strlen($m[0]));
        }
        // Quoted strings: any run of non-quote, non-backslash characters or
        // backslash escapes. Empty and one-character literals are accepted;
        // escapes are kept verbatim in the token value.
        if (re('/^"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)"/s', $c, $m)) {
            return new lang_python_token("STRING", (string)$m[1], strlen($m[0]));
        }
        if (re('/^\'([^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'/s', $c, $m)) {
            return new lang_python_token("STRING", (string)$m[1], strlen($m[0]));
        }
        if (re('|^[A-Za-z_][A-Za-z0-9_]*|', $c, $m)) {
            return new lang_python_token("NAME", new lang_python_literal($m[0]), strlen($m[0]));
        }
        return new lang_python_token('FAIL');
    }

    /**
     * Compute the indentation width of a run of leading whitespace.
     *
     * A space advances one column; a tab advances to the next multiple of
     * eight columns, as in Python's indentation rules. Any other character
     * is ignored.
     *
     * @param string $s Leading whitespace of a line.
     * @return int Indentation width in columns.
     */
    public static function python_tokenize_compute_space($s) {
        $column = 0;
        $length = strlen($s);
        for ($k = 0; $k < $length; $k++) {
            if ($s[$k] === " ") {
                $column++;
            } else if ($s[$k] === "\t") {
                $column += 8 - ($column % 8);
            }
        }
        return $column;
    }
}