finish all table analyze

Burgess Leo
2025-05-20 16:26:58 +08:00
parent 846c7f6beb
commit 08a47c6d8a
7 changed files with 560 additions and 177 deletions


@@ -227,96 +227,197 @@ def GetFile80hPattern(file_path):
# if __name__ == '__main__':
#     GetFile80hPattern(r"Z:\demo.jpg")
#     data = GetFile80hPattern(r"Z:\hello.txt")
#     print(data)
def analyze_ntfs_data_attribute(data):
    """
    Analyze the 0x80 ($DATA) attribute in an NTFS structure and return the
    file's fragment count.

    Args:
        data (list): list of dicts, each carrying a 'sequence' key.

    Returns:
        int: fragment count (1 for a resident attribute; for a non-resident
             attribute, the number of runs in the run list).

    Raises:
        ValueError: if the input data is invalid.
    """
    # Step 1: extract and convert the 'sequence' data
    hex_bytes = []
    for entry in data:
        if 'sequence' in entry:
            # Split each hex string on spaces and merge into one list
            for hex_str in entry['sequence']:
                hex_bytes.extend(hex_str.split())

    # Convert the hex strings into a list of integers
    try:
        attribute_data = [int(x, 16) for x in hex_bytes]
    except ValueError:
        raise ValueError("Invalid hexadecimal data")

    # Step 2: analyze the attribute structure
    if len(attribute_data) < 24:
        raise ValueError("Attribute data too short to parse the header")

    # Check the attribute type (0x80)
    if attribute_data[0] != 0x80:
        raise ValueError("Not a 0x80 ($DATA) attribute")

    # Check the resident flag (offset 0x08)
    is_resident = attribute_data[8] == 0

    if is_resident:
        return 1
    else:
        # Walk the run list of the non-resident attribute
        data_run_offset = attribute_data[0x20] | (attribute_data[0x21] << 8)
        if data_run_offset >= len(attribute_data):
            raise ValueError("Run list offset exceeds the attribute length")

        data_runs = attribute_data[data_run_offset:]
        fragment_count = 0
        pos = 0
        while pos < len(data_runs):
            header_byte = data_runs[pos]
            if header_byte == 0x00:
                break
            len_len = (header_byte >> 4) & 0x0F
            offset_len = header_byte & 0x0F
            if len_len == 0 or offset_len == 0:
                break
            pos += 1 + len_len + offset_len
            fragment_count += 1
        return fragment_count


input_data = [
    {
        'start_byte': 3221267456,
        'offset': 264,
        'sequence': [
            '80 00 00 00 48 00 00 00',
            '01 00 00 00 00 00 01 00',
            '00 00 00 00 00 00 00 00',
            '79 00 00 00 00 00 00 00',
            '40 00 00 00 00 00 00 00',
            '00 a0 07 00 00 00 00 00',
            '0b 93 07 00 00 00 00 00',
            '0b 93 07 00 00 00 00 00',
            '31 7a 00 ee 0b 00 00 00'
        ],
        'is_resident': False,
        'total_groups': 9,
        'attribute_length': 72
    }
]
print(analyze_ntfs_data_attribute(input_data))  # prints the fragment count


def ExtractSequenceHexValues(file80h_pattern):
    """
    Extract every 'sequence' hex string from the given structure and merge
    them into one flat list.

    Args:
        file80h_pattern (list): list of dicts, each carrying a 'sequence' key.

    Returns:
        list: merged list of all sequence byte values.
    """
    sequence_list = []
    for entry in file80h_pattern:
        if 'sequence' in entry:
            for hex_str in entry['sequence']:
                # Split the string on whitespace and append the bytes
                sequence_list.extend(hex_str.split())
    return sequence_list
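# Quick check on a minimal pattern of the same shape as input_data above:
# every space-separated hex string is flattened into one byte list.
_demo_pattern = [{'sequence': ['31 7a 00 ee 0b', '00 00 00']}]
assert ExtractSequenceHexValues(_demo_pattern) == ['31', '7a', '00', 'ee', '0b', '00', '00', '00']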
def ExportDataRunList(data_run):
    """
    Split the raw run list in data_run into independent per-run list fragments.

    Args:
        data_run (list): list of hex strings holding the run list bytes.

    Returns:
        list: one list of hex strings per data run.
    """
    result = []
    pos = 0

    while pos < len(data_run):
        current_byte = data_run[pos]

        if current_byte == '00':
            # A zero header byte terminates the run list
            break

        try:
            header = int(current_byte, 16)
            # Low nibble: size of the run-length field;
            # high nibble: size of the run-offset field
            length_field_size = header & 0x0F
            offset_field_size = (header >> 4) & 0x0F

            if length_field_size == 0 or offset_field_size == 0:
                print(f"⚠️ Invalid field size at position {pos}, stopping")
                break

            # Total size of the current data run (header plus both fields)
            run_size = 1 + length_field_size + offset_field_size

            # Slice out the current data run
            fragment = data_run[pos: pos + run_size]
            result.append(fragment)

            # Advance the cursor
            pos += run_size
        except Exception as e:
            print(f"❌ Parse failure at position {pos}: {e}")
            break

    return result
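# Sanity check on the run list taken from input_data above: header 0x31 means
# a 1-byte length field plus a 3-byte offset field, so the run spans 5 bytes
# and the trailing 00 terminates the list.
assert ExportDataRunList(['31', '7a', '00', 'ee', '0b', '00', '00', '00']) == \
    [['31', '7a', '00', 'ee', '0b']]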
def hex_list_to_int(lst, byteorder='little'):
    """
    Convert a list of hex-string bytes into an integer (little-endian by default).
    """
    if byteorder == 'little':
        lst = list(reversed(lst))
    return int(''.join(f"{int(b, 16):02x}" for b in lst), 16)
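# Quick check: the 3-byte field 00 ee 0b is stored little-endian, so the most
# significant byte comes last.
assert hex_list_to_int(['00', 'ee', '0b']) == 0x0bee00  # 781824
assert hex_list_to_int(['00', 'ee', '0b'], byteorder='big') == 0x00ee0b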
def parse_data_run(data_run, previous_cluster=0, cluster_size=512):
    """
    Parse a single NTFS data run and return its starting byte, ending byte,
    and byte length.

    Args:
        data_run (list): hex strings of a single data run.
        previous_cluster (int): starting cluster of the previous run
            (run offsets are stored relative to it).
        cluster_size (int): bytes per sector (default 512); clusters are
            assumed to span 8 sectors.

    Returns:
        dict: starting byte, ending byte, byte length, and cluster information.
    """
    if not data_run or data_run[0] == '00':
        return None

    header = int(data_run[0], 16)
    # Low nibble: size of the run-length field;
    # high nibble: size of the run-offset field
    length_field_size = header & 0x0F
    offset_field_size = (header >> 4) & 0x0F

    # The length field comes first, immediately followed by the offset field
    length_data = data_run[1:1 + length_field_size]
    offset_data = data_run[1 + length_field_size:1 + length_field_size + offset_field_size]

    # Little-endian fields to integers (reuses the module-level helper)
    run_length = hex_list_to_int(length_data)
    offset = hex_list_to_int(offset_data)
    # NOTE: run offsets are signed in NTFS; backward (negative) offsets and
    # sparse runs are not handled here.

    # Compute the cluster range (the offset is relative to the previous run)
    starting_cluster = previous_cluster + offset
    ending_cluster = starting_cluster + run_length - 1

    # Convert clusters to byte offsets
    sectors_per_cluster = 8
    bytes_per_sector = cluster_size
    starting_byte = starting_cluster * sectors_per_cluster * bytes_per_sector
    byte_length = run_length * sectors_per_cluster * bytes_per_sector
    ending_byte = starting_byte + byte_length - 1

    return {
        "starting_byte": starting_byte,
        "ending_byte": ending_byte,
        "byte_length": byte_length,
        "starting_cluster": starting_cluster,
        "ending_cluster": ending_cluster,
        "run_length_clusters": run_length
    }
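# Worked example for the run 31 7a 00 ee 0b from input_data above, assuming
# 8 sectors per cluster and 512-byte sectors (4096-byte clusters): run length
# 0x7a = 122 clusters, offset 0x0bee00 = 781824 clusters.
_run = parse_data_run(['31', '7a', '00', 'ee', '0b'])
assert _run['starting_cluster'] == 781824
assert _run['starting_byte'] == 781824 * 4096    # 3202351104
assert _run['byte_length'] == 122 * 4096         # 499712
assert _run['ending_byte'] == 3202850815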
def ParseMultipleDataRuns(fragments, cluster_size=512):
    """
    Parse multiple data run fragments in order and return their byte offsets.

    Args:
        fragments (list): list of per-run hex string lists.
        cluster_size (int): bytes per sector (default 512).

    Returns:
        list: one dict of byte offset information per run.
    """
    results = []
    previous_starting_cluster = 0
    for fragment in fragments:
        result = parse_data_run(fragment, previous_starting_cluster, cluster_size)
        if result:
            results.append(result)
            # Each run's offset is relative to the previous run's starting cluster
            previous_starting_cluster = result["starting_cluster"]
    return results
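# Chaining example with a made-up second run: header 0x21 gives a 1-byte length
# field (0x10 = 16 clusters) and a 2-byte offset field (00 10 -> 0x1000 = 4096),
# and that offset is relative to the first run's starting cluster.
_runs = ParseMultipleDataRuns([['31', '7a', '00', 'ee', '0b'],
                               ['21', '10', '00', '10']])
assert _runs[0]['starting_cluster'] == 781824
assert _runs[1]['starting_cluster'] == 781824 + 4096
assert _runs[1]['run_length_clusters'] == 16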
def GetFragmentData(file80h_pattern):
    """
    Turn a parsed 0x80 ($DATA) attribute into a list of byte extents,
    covering both the resident and the non-resident case.
    """
    if not file80h_pattern or not isinstance(file80h_pattern, list):
        return []

    if file80h_pattern[0].get('is_resident'):
        # Resident attribute: the file content lives inside the attribute.
        start_byte = file80h_pattern[0].get('start_byte')
        offset = file80h_pattern[0].get('offset')
        # Bytes 0x10-0x17 of the attribute hold the content size (4 bytes at
        # 0x10) and the content offset (2 bytes at 0x14; bytes 0x16-0x17 are
        # flags/padding and normally zero).
        content_start = file80h_pattern[0].get('sequence')[2]
        content_start_list = content_start.split()
        content_len = content_start_list[::-1][4:8]
        content_offset = content_start_list[::-1][:4]
        content_len_str = ''.join(content_len)
        content_len_decimal_value = int(content_len_str, 16)
        content_offset_str = ''.join(content_offset)
        content_offset_decimal_value = int(content_offset_str, 16)
        file_offset = start_byte + offset + content_offset_decimal_value
        return [{
            'starting_byte': file_offset,
            'byte_length': content_len_decimal_value
        }]
    else:
        # Non-resident attribute: locate and decode the run list.
        sequence_list = ExtractSequenceHexValues(file80h_pattern)
        # Bytes 0x20-0x21 hold the run list offset (little-endian)
        data_run_offset = sequence_list[32:34][::-1]
        data_run_offset_str = ''.join(data_run_offset)
        data_run_offset_decimal_value = int(data_run_offset_str, 16)
        data_run_list = sequence_list[data_run_offset_decimal_value:]
        fragments = ExportDataRunList(data_run_list)
        results = ParseMultipleDataRuns(fragments)
        return results
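# End-to-end check on the non-resident input_data above: bytes 0x20-0x21 give
# a run list offset of 0x40, and the single run 31 7a 00 ee 0b decodes to one
# 499712-byte fragment.
_fragments = GetFragmentData(input_data)
assert _fragments[0]['starting_byte'] == 3202351104
assert _fragments[0]['byte_length'] == 499712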
# if __name__ == '__main__':
#     arri80_data = GetFile80hPattern(r"Z:\hello.txt")
#     data = GetFragmentData(arri80_data)
#     print(data)