Skip to content
93 changes: 52 additions & 41 deletions html5lib/_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,6 @@ def emitCurrentToken(self):
token = self.currentToken
# Add token to the queue to be yielded
if (token["type"] in tagTokenTypes):
token["name"] = token["name"].translate(asciiUpper2Lower)
if token["type"] == tokenTypes["StartTag"]:
raw = token["data"]
data = attributeMap(raw)
Expand Down Expand Up @@ -380,7 +379,8 @@ def tagOpenState(self):
self.state = self.closeTagOpenState
elif data in asciiLetters:
self.currentToken = {"type": tokenTypes["StartTag"],
"name": data, "data": [],
"name": data.translate(asciiUpper2Lower),
"data": [],
"selfClosing": False,
"selfClosingAcknowledged": False}
self.state = self.tagNameState
Expand Down Expand Up @@ -410,7 +410,8 @@ def tagOpenState(self):
def closeTagOpenState(self):
data = self.stream.char()
if data in asciiLetters:
self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
self.currentToken = {"type": tokenTypes["EndTag"],
"name": data.translate(asciiUpper2Lower),
"data": [], "selfClosing": False}
self.state = self.tagNameState
elif data == ">":
Expand Down Expand Up @@ -448,7 +449,7 @@ def tagNameState(self):
"data": "invalid-codepoint"})
self.currentToken["name"] += "\uFFFD"
else:
self.currentToken["name"] += data
self.currentToken["name"] += data.translate(asciiUpper2Lower)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm very skeptical that this is a perf win compared with doing the lowercasing once in emitCurrentToken. What do the benchmarks say?

Yes, emitCurrentToken's lowercasing becomes redundant in the RCDATA/RAWTEXT/script cases, but I expect the cost of calling translate on every character of the tag name here will negate any gains.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's fair, yep - especially for short element names, it seems likely that the overhead of the translate method call (particularly when it is called repeatedly, once per character) could negate any benefit gained from the simpler comparisons.

I hadn't assessed the performance of this code path separately; it felt worth maintaining consistency with the other states, but I don't believe there's a noticeable performance change either way.

# (Don't use charsUntil here, because tag names are
# very short and it's faster to not do anything fancy)
return True
Expand Down Expand Up @@ -476,26 +477,29 @@ def rcdataEndTagOpenState(self):
return True

def rcdataEndTagNameState(self):
appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
data = self.stream.char()
if data in asciiLetters:
self.temporaryBuffer += data
return True

name = self.temporaryBuffer.translate(asciiUpper2Lower)
appropriate = self.currentToken and self.currentToken["name"] == name
if data in spaceCharacters and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"name": name,
"data": [], "selfClosing": False}
self.state = self.beforeAttributeNameState
elif data == "/" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"name": name,
"data": [], "selfClosing": False}
self.state = self.selfClosingStartTagState
elif data == ">" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"name": name,
"data": [], "selfClosing": False}
self.emitCurrentToken()
self.state = self.dataState
elif data in asciiLetters:
self.temporaryBuffer += data
else:
self.tokenQueue.append({"type": tokenTypes["Characters"],
"data": "</" + self.temporaryBuffer})
Expand Down Expand Up @@ -526,26 +530,29 @@ def rawtextEndTagOpenState(self):
return True

def rawtextEndTagNameState(self):
appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
data = self.stream.char()
if data in asciiLetters:
self.temporaryBuffer += data
return True

name = self.temporaryBuffer.translate(asciiUpper2Lower)
appropriate = self.currentToken and self.currentToken["name"] == name
if data in spaceCharacters and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"name": name,
"data": [], "selfClosing": False}
self.state = self.beforeAttributeNameState
elif data == "/" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"name": name,
"data": [], "selfClosing": False}
self.state = self.selfClosingStartTagState
elif data == ">" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"name": name,
"data": [], "selfClosing": False}
self.emitCurrentToken()
self.state = self.dataState
elif data in asciiLetters:
self.temporaryBuffer += data
else:
self.tokenQueue.append({"type": tokenTypes["Characters"],
"data": "</" + self.temporaryBuffer})
Expand Down Expand Up @@ -579,26 +586,29 @@ def scriptDataEndTagOpenState(self):
return True

def scriptDataEndTagNameState(self):
appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
data = self.stream.char()
if data in asciiLetters:
self.temporaryBuffer += data
return True

name = self.temporaryBuffer.translate(asciiUpper2Lower)
appropriate = self.currentToken and self.currentToken["name"] == name
if data in spaceCharacters and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"name": name,
"data": [], "selfClosing": False}
self.state = self.beforeAttributeNameState
elif data == "/" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"name": name,
"data": [], "selfClosing": False}
self.state = self.selfClosingStartTagState
elif data == ">" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"name": name,
"data": [], "selfClosing": False}
self.emitCurrentToken()
self.state = self.dataState
elif data in asciiLetters:
self.temporaryBuffer += data
else:
self.tokenQueue.append({"type": tokenTypes["Characters"],
"data": "</" + self.temporaryBuffer})
Expand Down Expand Up @@ -715,26 +725,29 @@ def scriptDataEscapedEndTagOpenState(self):
return True

def scriptDataEscapedEndTagNameState(self):
appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
data = self.stream.char()
if data in asciiLetters:
self.temporaryBuffer += data
return True

name = self.temporaryBuffer.translate(asciiUpper2Lower)
appropriate = self.currentToken and self.currentToken["name"] == name
if data in spaceCharacters and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"name": name,
"data": [], "selfClosing": False}
self.state = self.beforeAttributeNameState
elif data == "/" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"name": name,
"data": [], "selfClosing": False}
self.state = self.selfClosingStartTagState
elif data == ">" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"name": name,
"data": [], "selfClosing": False}
self.emitCurrentToken()
self.state = self.dataState
elif data in asciiLetters:
self.temporaryBuffer += data
else:
self.tokenQueue.append({"type": tokenTypes["Characters"],
"data": "</" + self.temporaryBuffer})
Expand Down Expand Up @@ -776,7 +789,9 @@ def scriptDataDoubleEscapedState(self):
"eof-in-script-in-script"})
self.state = self.dataState
else:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
chars = self.stream.charsUntil(("<", "-", "\u0000"))
self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
data + chars})
return True

def scriptDataDoubleEscapedDashState(self):
Expand Down Expand Up @@ -859,7 +874,8 @@ def beforeAttributeNameState(self):
if data in spaceCharacters:
self.stream.charsUntil(spaceCharacters, True)
elif data in asciiLetters:
self.currentToken["data"].append([data, ""])
attr_name = data.translate(asciiUpper2Lower)
self.currentToken["data"].append([attr_name, ""])
self.state = self.attributeNameState
elif data == ">":
self.emitCurrentToken()
Expand Down Expand Up @@ -891,8 +907,7 @@ def attributeNameState(self):
if data == "=":
self.state = self.beforeAttributeValueState
elif data in asciiLetters:
self.currentToken["data"][-1][0] += data +\
self.stream.charsUntil(asciiLetters, True)
self.currentToken["data"][-1][0] += data.translate(asciiUpper2Lower)
leavingThisState = False
elif data == ">":
# XXX If we emit here the attributes are converted to a dict
Expand All @@ -919,15 +934,13 @@ def attributeNameState(self):
"data": "eof-in-attribute-name"})
self.state = self.dataState
else:
self.currentToken["data"][-1][0] += data
self.currentToken["data"][-1][0] += data.translate(asciiUpper2Lower)
leavingThisState = False

if leavingThisState:
# Attributes are not dropped at this stage. That happens when the
# start tag token is emitted so values can still be safely appended
# to attributes, but we do want to report the parse error in time.
self.currentToken["data"][-1][0] = (
self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
for name, _ in self.currentToken["data"][:-1]:
if self.currentToken["data"][-1][0] == name:
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
Expand All @@ -947,7 +960,8 @@ def afterAttributeNameState(self):
elif data == ">":
self.emitCurrentToken()
elif data in asciiLetters:
self.currentToken["data"].append([data, ""])
attr_name = data.translate(asciiUpper2Lower)
self.currentToken["data"].append([attr_name, ""])
self.state = self.attributeNameState
elif data == "/":
self.state = self.selfClosingStartTagState
Expand Down Expand Up @@ -1341,17 +1355,15 @@ def beforeDoctypeNameState(self):
self.tokenQueue.append(self.currentToken)
self.state = self.dataState
else:
self.currentToken["name"] = data
self.currentToken["name"] = data.translate(asciiUpper2Lower)
self.state = self.doctypeNameState
return True

def doctypeNameState(self):
data = self.stream.char()
if data in spaceCharacters:
self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
self.state = self.afterDoctypeNameState
elif data == ">":
self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
self.tokenQueue.append(self.currentToken)
self.state = self.dataState
elif data == "\u0000":
Expand All @@ -1363,11 +1375,10 @@ def doctypeNameState(self):
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"eof-in-doctype-name"})
self.currentToken["correct"] = False
self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
self.tokenQueue.append(self.currentToken)
self.state = self.dataState
else:
self.currentToken["name"] += data
self.currentToken["name"] += data.translate(asciiUpper2Lower)
return True

def afterDoctypeNameState(self):
Expand Down