begin implementing intel opcode syntax parser

This commit is contained in:
mykola2312 2024-08-01 02:24:43 +03:00
parent 24c290e29e
commit 5998950f23

92
genc.py
View file

@ -1,83 +1,51 @@
import re import re
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
class OpCode: class Instruction:
OPCODE_REGEX = re.compile("[0-9A-F][0-9A-F]") REX_REGEX = re.compile("^REX\\.(.)")
BYTES_REGEX = re.compile("([0-9A-F][0-9A-F])")
DIGIT_REGEX = re.compile("\\/(\\d)")
MODRM_REGEX = re.compile("\\/r")
IMM_REGEX = re.compile("i(.)")
VALUE_REGEX = re.compile("c(.)")
OPREG_REGEX = re.compile("r(.)")
def __init__(self, ins, operand_encodings): def __init__(self, ins):
self.x32m = ins.attrib["x32m"] self.x32m = ins.attrib["x32m"]
self.x64m = ins.attrib["x64m"] self.x64m = ins.attrib["x64m"]
self.args = ins.find("args").text
opc = ins.find("opc") opc = ins.find("opc").text
self.opcode = OpCode.OPCODE_REGEX.findall(opc.text) if "VEX" in opc: return
openc = opc.attrib.get("openc") rex = Instruction.REX_REGEX.search(opc)
if openc: bytes = Instruction.BYTES_REGEX.findall(opc)
self.operand_encoding = operand_encodings.get(openc, openc) digit = Instruction.DIGIT_REGEX.search(opc)
else: self.operand_encoding = None modrm = Instruction.MODRM_REGEX.search(opc)
imm = Instruction.IMM_REGEX.search(opc)
value = Instruction.VALUE_REGEX.search(opc)
opreg = Instruction.OPREG_REGEX.search(opc)
def __str__(self): print(ins.find("mnem").text)
return f"\topcode {self.opcode} args {self.args} op_enc {self.operand_encoding}" if rex: print("rex\t", rex.group(1))
print(bytes)
def __eq__(self, other): if digit: print("digit\t", digit.group(1))
return self.opcode == other.opcode and self.operand_encoding == other.operand_encoding if modrm: print("modrm\t", modrm.group(0))
if imm: print("imm\t", imm.group(1))
def __key(self): if value: print("value\t", value.group(1))
return ("".join(self.opcode), "".join(self.operand_encoding or [])) if opreg: print("opreg\t", opreg.group(1))
def __hash__(self):
return hash(self.__key())
class Instruction:
SKIP_16BIT_REALMODE = ["rel16", "imm16", "ptr16:16"]
def contains_16bit_mode(args):
for needle in Instruction.SKIP_16BIT_REALMODE:
if needle in args:
return True
class InstructionGroup:
def __init__(self, common): def __init__(self, common):
self.brief = common.find("brief").text self.brief = common.find("brief").text
self.instructions = [Instruction(ins) for ins in common.iter("ins")]
operand_encodings = {}
for operand_encoding in common.iter("oprndenc"):
name = operand_encoding.attrib["openc"]
operands = []
operands.append(operand_encoding.find("oprnd1").text)
operands.append(operand_encoding.find("oprnd2").text)
operands.append(operand_encoding.find("oprnd3").text)
operands.append(operand_encoding.find("oprnd4").text)
operand_encodings[name] = operands
self.opcodes = []
for ins in common.iter("ins"):
self.opcodes.append(OpCode(ins, operand_encodings))
# remove 16 bit real mode displacement value opcodes
self.opcodes = list(filter(lambda op: not Instruction.contains_16bit_mode(op.args), self.opcodes))
# de-duplicate opcodes with set
_opcodes = self.opcodes
self.opcodes = set()
for op in _opcodes:
self.opcodes.add(op)
def parse_file(path): def parse_file(path):
tree = ET.parse(path) tree = ET.parse(path)
root = tree.getroot() root = tree.getroot()
instructions = [] groups = [InstructionGroup(common) for common in root.iter("common")]
for common in root: return groups
instructions.append(Instruction(common))
for instruction in instructions:
print(instruction.brief)
for opcode in instruction.opcodes:
print(opcode)
if __name__ == "__main__": if __name__ == "__main__":