<?php
# Identifiers that python_tokenize() emits as their own upper-cased token
# (e.g. "for" -> FOR) instead of a generic LNAME token.
$python_tokens = array(
    'for',
    'in',
);
/**
 * Split Python source text into a flat list of tokens.
 *
 * The input is first passed through python_sanitize(); the result is then
 * consumed from the front, one pattern match at a time, until nothing is left.
 * Each branch below tries one anchored pattern via re(), which is presumably a
 * preg_match() wrapper that fills $r with the match groups (defined elsewhere;
 * confirm). Tokens are built with token(name[, value]).
 *
 * Characters that match no pattern are silently skipped one at a time.
 * The function die()s when a run of tabs changes the indent level by more
 * than one.
 *
 * @param string $code Raw Python source.
 * @return array List of values produced by token(), in source order.
 */
function python_tokenize($code) {
    global $python_tokens;
    $c = python_sanitize($code);
    $tokens = array();
    $indent = 0;
    while (strlen($c)) {
        if (re('/^[ ]+/', $c, $r)) {
            # spaces carry no meaning here; they are only consumed
        } else if (re('/^[\t]+/', $c, $r)) {
            # indent/dedent: compare the length of the tab run with the current level
            # NOTE(review): this branch fires for any tab run at the current
            # position, not only at the start of a line, and a line without
            # leading tabs never reaches it (so no DEDENT is emitted for it).
            # Behavior kept as-is; confirm against the parser's expectations.
            $d = strlen($r[0]) - $indent;
            switch ($d) {
                case 0:
                    break;
                case +1:
                    $tokens[] = token("INDENT");
                    $indent++;
                    break;
                case -1:
                    $tokens[] = token("DEDENT");
                    $indent--;
                    break;
                default:
                    die("Expectend indented block");
            }
        } else if (re('/^\n/', $c, $r)) {
            # newline: collapse consecutive newlines and skip leading ones
            $last = $tokens ? $tokens[count($tokens) - 1] : null;
            if (isset($last["name"]) && $last["name"] != "NEWLINE") {
                $tokens[] = token("NEWLINE");
            }
        } else if (re('/^#.*/', $c, $r)) {
            # sharp comment, up to (not including) the end of the line
            python_parse_comment($r[0]);
        } else if (re('/^,/', $c, $r)) {
            # comma
            $tokens[] = token("COMMA");
        } else if (re('/^:/', $c, $r)) {
            # colon
            $tokens[] = token("COLON");
        } else if (re('/^;/', $c, $r)) {
            # semicolon
            $tokens[] = token("SEMICOLON");
        } else if (re('/^=/', $c, $r)) {
            # equal sign
            $tokens[] = token("EQUAL");
        } else if (re('/^\(/', $c, $r)) {
            # round bracket open
            $tokens[] = token("RBO");
        } else if (re('/^\)/', $c, $r)) {
            # round bracket close
            $tokens[] = token("RBC");
        } else if (re('/^\[/', $c, $r)) {
            # square bracket open
            $tokens[] = token("SBO");
        } else if (re('/^\]/', $c, $r)) {
            # square bracket close
            $tokens[] = token("SBC");
        } else if (re('/^{/', $c, $r)) {
            # curly bracket open
            $tokens[] = token("BO");
        } else if (re('/^}/', $c, $r)) {
            # curly bracket close
            $tokens[] = token("BC");
        } else if (re('/^print\b/', $c, $r)) {
            # print keyword; \b keeps names such as "printer" intact
            $tokens[] = token("PRINT");
        } else if (re('/^True\b/', $c, $r)) {
            # True literal
            $tokens[] = token("TRUE");
        } else if (re('/^False\b/', $c, $r)) {
            # False literal
            $tokens[] = token("FALSE");
        } else if (re('/^None\b/', $c, $r)) {
            # None literal
            $tokens[] = token("NONE");
        } else if (re('/^\.\.\./', $c, $r)) {
            # ellipsis (token name spelling kept, other code may match on it)
            $tokens[] = token("TRIEPLEDOT");
        } else if (re('/^[0-9]+(\.[0-9]*)?/', $c, $r)) {
            # number: integer part with optional fractional part
            $tokens[] = token("NUMBER", $r[0]);
        } else if (re('/^"((?:[^"\\\\]|\\\\.)*)"/', $c, $r)) {
            # double quoted string: any run of non-quote, non-backslash
            # characters or backslash escapes, including the empty string.
            # The value is the raw text between the quotes (escapes untouched).
            $tokens[] = token("DSTRING", $r[1]);
        } else if (re('/^[A-Za-z_][A-Za-z0-9_]*/', $c, $r)) {
            # name or literal-token: words listed in $python_tokens become
            # their own upper-cased token, everything else is an LNAME
            if (is_array($python_tokens) && in_array($r[0], $python_tokens, true)) {
                $tokens[] = token(strtoupper($r[0]));
            } else {
                $tokens[] = token("LNAME", $r[0]);
            }
        } else {
            # rest: nothing matched, drop a single character and carry on
            $r = array($c[0]);
        }
        # advance past whatever the matching branch consumed
        $c = substr($c, strlen($r[0]));
    }
    return $tokens;
}