# fastcopy/ntfs_utils/db_node.py

import hashlib
import os
import random
import sqlite3
from datetime import datetime

from mft_analyze import GetFile80hPattern


# Helper: get the file extension (lower-cased, without the dot).
def GetFileExtension(name: str) -> str:
    parts = name.rsplit('.', 1)
    return parts[1].lower() if len(parts) > 1 else ""
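
# For instance, GetFileExtension("report.PDF") returns "pdf" and
# GetFileExtension("Makefile") returns "" (no dot means no extension).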


# Look up ExtendNameID in db_extend_name based on the file-name suffix.
def GetExtendNameId(name: str, cursor: sqlite3.Cursor) -> int:
    ext = GetFileExtension(name)
    if not ext:
        return 0
    cursor.execute("SELECT ID FROM db_extend_name WHERE ExtendName = ?", (ext,))
    result = cursor.fetchone()
    return result[0] if result else 0


# Compute DirLayer: how many directory levels below the volume root the path sits.
def GetDirLayer(path: str) -> int:
    path = path.strip()
    if not path or path == "\\":
        return 0
    return path.count("\\") - 1
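
# Illustrative values under the backslash-separated path convention used above:
#
#     GetDirLayer("\\")                   -> 0  (volume root)
#     GetDirLayer("\\docs")               -> 0  (entry directly under the root)
#     GetDirLayer("\\docs\\2024\\a.txt")  -> 2  (two levels below the root)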


# Get the default GroupID (first row of db_group).
def GetFirstGroupId(cursor: sqlite3.Cursor) -> int:
    cursor.execute("SELECT ID FROM db_group ORDER BY ID LIMIT 1")
    result = cursor.fetchone()
    return result[0] if result else 0


# Get the default UserID (first row of db_user).
def GetFirstUserId(cursor: sqlite3.Cursor) -> int:
    cursor.execute("SELECT ID FROM db_user ORDER BY ID LIMIT 1")
    result = cursor.fetchone()
    return result[0] if result else 0


def GetFilesTime(file_path):
    """
    Return the creation, modification, access and permission-change times of a file.

    st_atime: last access time (FileAccessTime)
    st_mtime: last content-modification time (FileModifyTime)
    st_ctime: metadata change time; on Windows this is the file creation time (FileCreateTime)

    Note: Windows and Linux define these fields differently; on Linux st_ctime is the
    metadata change time, not the creation time.

    Args:
        file_path (str): absolute path of the file

    Returns:
        dict: FileCreateTime, FileModifyTime, FileAccessTime and FileAuthTime as strings,
              or "default" for every field if the times cannot be obtained
    """
    if not os.path.exists(file_path):
        return {
            "FileCreateTime": "default",
            "FileModifyTime": "default",
            "FileAccessTime": "default",
            "FileAuthTime": "default"
        }
    try:
        stat_info = os.stat(file_path)

        # Convert a timestamp into a readable "YYYY-MM-DD HH:MM:SS" string.
        def ts_to_str(timestamp):
            return datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')

        create_time = ts_to_str(stat_info.st_ctime)
        modify_time = ts_to_str(stat_info.st_mtime)
        access_time = ts_to_str(stat_info.st_atime)
        # Permission-change time: metadata change time on Linux; may not apply on Windows.
        try:
            auth_time = ts_to_str(getattr(stat_info, 'st_birthtime', stat_info.st_ctime))
        except Exception:
            auth_time = "default"
        return {
            "FileCreateTime": create_time,
            "FileModifyTime": modify_time,
            "FileAccessTime": access_time,
            "FileAuthTime": auth_time
        }
    except Exception as e:
        print(f"❌ Failed to read file times: {e}")
        return {
            "FileCreateTime": "default",
            "FileModifyTime": "default",
            "FileAccessTime": "default",
            "FileAuthTime": "default"
        }
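
# A minimal usage sketch (the path below is hypothetical):
#
#     times = GetFilesTime(r"C:\data\example.txt")
#     print(times["FileCreateTime"], times["FileModifyTime"])
#
# A missing or unreadable file yields the string "default" in every field, so the
# caller can insert the result into the database without extra None checks.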


# Get the device ID (first record in db_device).
def GetDeviceId(cursor: sqlite3.Cursor) -> int:
    cursor.execute("SELECT ID FROM db_device ORDER BY ID LIMIT 1")
    result = cursor.fetchone()
    return result[0] if result else 0


# Get the file size (placeholder data).
def GetFileSize(full_path: str) -> int:
    return random.randint(100, 999)


# Get the file content hash (placeholder data: hashes the path string, not the content).
def GetFileHash(full_path: str) -> str:
    return hashlib.sha256(full_path.encode()).hexdigest()
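
# GetFileSize and GetFileHash above intentionally return placeholder values (a random
# size and a hash of the path string rather than of the file content). If real content
# hashing were needed, a chunked read like the sketch below would keep memory use flat
# on large files; it is an illustration only and is not called anywhere in this module.
def _HashFileContent(full_path: str, chunk_size: int = 1 << 20) -> str:
    """Illustrative sketch only: SHA-256 of the file's actual bytes."""
    digest = hashlib.sha256()
    with open(full_path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()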


# Get the extent (fragment) count, 1-4.
def GetExtentCount(data):
    """
    Parse an NTFS 0x80 ($DATA) attribute and return the number of file fragments.

    Args:
        data (list): list of dicts, each of which must carry a 'sequence' key
                     holding space-separated hex byte strings.

    Returns:
        int: fragment count (1 for a resident attribute; for a non-resident
             attribute, the number of data runs in the run list).

    Raises:
        ValueError: if the input data is invalid.
    """
    # Step 1: collect and flatten the 'sequence' hex data.
    hex_bytes = []
    for entry in data:
        if 'sequence' in entry:
            for hex_str in entry['sequence']:
                hex_bytes.extend(hex_str.split())
    # Convert the hex strings into a list of integers.
    try:
        attribute_data = [int(x, 16) for x in hex_bytes]
    except ValueError:
        raise ValueError("Invalid hexadecimal data")

    # Step 2: analyse the attribute structure.
    if len(attribute_data) < 24:
        raise ValueError("Attribute data too short to parse the header")
    # Check the attribute type (0x80).
    if attribute_data[0] != 0x80:
        raise ValueError("Not a 0x80 ($DATA) attribute")
    # Check the resident flag (offset 0x08; 0 means resident).
    is_resident = attribute_data[8] == 0
    if is_resident:
        return 1

    # Parse the data-run list of the non-resident attribute.
    if len(attribute_data) < 0x22:
        raise ValueError("Attribute data too short to hold the run-list offset")
    data_run_offset = attribute_data[0x20] | (attribute_data[0x21] << 8)
    if data_run_offset >= len(attribute_data):
        raise ValueError("Data-run offset is beyond the attribute length")
    data_runs = attribute_data[data_run_offset:]
    fragment_count = 0
    pos = 0
    while pos < len(data_runs):
        header_byte = data_runs[pos]
        if header_byte == 0x00:
            break
        # Run header: low nibble = size of the length field,
        # high nibble = size of the offset field.
        offset_size = (header_byte >> 4) & 0x0F
        length_size = header_byte & 0x0F
        if length_size == 0 or offset_size == 0:
            break
        pos += 1 + length_size + offset_size
        fragment_count += 1
    return fragment_count
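
# A worked sketch of the data-run parsing above. The 'sequence' layout (a list of
# space-separated hex byte strings) is assumed to match what GetFile80hPattern returns;
# the bytes below are synthetic and the helper is illustrative only, never called here.
def _DemoExtentCount() -> int:
    """Build a fake non-resident $DATA attribute with two data runs; expect 2."""
    header = [0x00] * 64
    header[0x00] = 0x80   # attribute type: $DATA
    header[0x08] = 0x01   # non-resident flag
    header[0x20] = 0x40   # run list starts at offset 0x40 (low byte of a 16-bit value)
    # Run 1: header 0x21 -> 1-byte length field + 2-byte offset field.
    # Run 2: header 0x11 -> 1-byte length field + 1-byte offset field.
    # Trailing 0x00 terminates the run list.
    runs = [0x21, 0x10, 0x00, 0x04, 0x11, 0x08, 0x30, 0x00]
    sequence = ' '.join(f'{b:02X}' for b in header + runs)
    return GetExtentCount([{'sequence': [sequence]}])  # -> 2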


# Get a random location (placeholder data).
def GetRandomLocation() -> int:
    return random.randint(1000, 9999)


# Get a random length (placeholder data).
def GetRandomLength() -> int:
    return random.randint(1000, 9999)


# Main routine: import the db_path rows into db_node.
def InsertNodeDataToDB(db_path='../src/db_ntfs_info.db', table_name='db_node'):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    group_id = GetFirstGroupId(cursor)
    user_id = GetFirstUserId(cursor)
    device_id = GetDeviceId(cursor)

    cursor.execute("SELECT ID, Path, Name, ParentID FROM db_path")
    rows = cursor.fetchall()
    for row in rows:
        path_id, full_path, name, parent_id = row

        # Skip rows whose PathID already exists in db_node.
        cursor.execute("SELECT COUNT(*) FROM db_node WHERE PathID = ?", (path_id,))
        exists = cursor.fetchone()[0]
        if exists > 0:
            print(f"⚠️ PathID {path_id} already exists, skipping insert")
            continue

        # Compute the derived fields.
        name_hash = hashlib.sha256(name.encode()).hexdigest()
        dir_layer = GetDirLayer(full_path)
        extend_name_id = GetExtendNameId(name, cursor)
        file_size = GetFileSize(full_path)
        file_hash = GetFileHash(full_path)

        # File time attributes.
        file_times = GetFilesTime(full_path)
        create_time = file_times["FileCreateTime"]
        modify_time = file_times["FileModifyTime"]
        access_time = file_times["FileAccessTime"]
        auth_time = file_times["FileAuthTime"]

        # New: derive a more precise ExtentCount from the $80 ($DATA) attribute.
        try:
            attribute_80_data = GetFile80hPattern(full_path)
            if not attribute_80_data or not isinstance(attribute_80_data, list):
                raise ValueError("Invalid 80h attribute data")
            extent_count = GetExtentCount(attribute_80_data)
            print(f"✅ Extent count: {extent_count}")
        except Exception as e:
            print(f"⚠️ Failed to get ExtentCount, falling back to 0: {e}")
            extent_count = 0

        # Insert-statement fields and parameters (kept in the original order).
        fields = [
            'PathID', 'ParentID', 'NameHash', 'PathHash',
            'ExtendNameID', 'DirLayer', 'GroupID', 'UserID',
            'FileCreateTime', 'FileModifyTime', 'FileAccessTime', 'FileAuthTime',
            'FileSize', 'FileMode', 'FileHash', 'ExtentCount'
        ]
        values = [
            path_id, parent_id, name_hash, '',  # PathHash is filled in below
            extend_name_id, dir_layer, group_id, user_id,
            create_time, modify_time, access_time, auth_time,
            file_size, 'default', file_hash, extent_count
        ]

        # Look up PathHash (kept identical to db_path.PathHash).
        cursor.execute("SELECT PathHash FROM db_path WHERE ID = ?", (path_id,))
        path_hash_result = cursor.fetchone()
        path_hash = path_hash_result[0] if path_hash_result else ""
        values[3] = path_hash  # fill in PathHash

        # Build the extent fragment fields.
        extent_data = []
        for i in range(1, 5):
            if i <= extent_count:
                location = GetRandomLocation()
                length = GetRandomLength()
                extent_data.extend([device_id, location, length])
            else:
                extent_data.extend([None, None, None])

        # Append the extent fields and values.
        extent_fields = [
            "extent1_DeviceID", "extent1_Location", "extent1_Length",
            "extent2_DeviceID", "extent2_Location", "extent2_Length",
            "extent3_DeviceID", "extent3_Location", "extent3_Length",
            "extent4_DeviceID", "extent4_Location", "extent4_Length"
        ]
        fields += extent_fields
        values += extent_data

        # Build and execute the INSERT statement.
        placeholders = ', '.join('?' * len(values))
        insert_sql = f"INSERT INTO {table_name} ({', '.join(fields)}) VALUES ({placeholders})"
        cursor.execute(insert_sql, values)

    conn.commit()
    conn.close()
    print(f"✅ Data successfully inserted into {table_name}")


if __name__ == '__main__':
    InsertNodeDataToDB()