# fastcopy/ntfs_utils/db_node.py

import hashlib
import os
import random
import sqlite3
from datetime import datetime

from mft_analyze import GetFile80hPattern


# Helper: get the file extension (lower-cased, without the dot).
def GetFileExtension(name: str) -> str:
    parts = name.rsplit('.', 1)
    return parts[1].lower() if len(parts) > 1 else ""
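
# For instance, GetFileExtension("report.PDF") returns "pdf" and
# GetFileExtension("Makefile") returns "" (no dot means no extension).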


# Look up ExtendNameID in db_extend_name based on the file-name suffix.
def GetExtendNameId(name: str, cursor: sqlite3.Cursor) -> int:
    ext = GetFileExtension(name)
    if not ext:
        return 0
    cursor.execute("SELECT ID FROM db_extend_name WHERE ExtendName = ?", (ext,))
    result = cursor.fetchone()
    return result[0] if result else 0


# Compute DirLayer: how many directory levels below the volume root the path sits.
def GetDirLayer(path: str) -> int:
    path = path.strip()
    if not path or path == "\\":
        return 0
    return path.count("\\") - 1
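
# Illustrative values under the backslash-separated path convention used above:
#
#     GetDirLayer("\\")                   -> 0  (volume root)
#     GetDirLayer("\\docs")               -> 0  (entry directly under the root)
#     GetDirLayer("\\docs\\2024\\a.txt")  -> 2  (two levels below the root)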


# Get the default GroupID (first row of db_group).
def GetFirstGroupId(cursor: sqlite3.Cursor) -> int:
    cursor.execute("SELECT ID FROM db_group ORDER BY ID LIMIT 1")
    result = cursor.fetchone()
    return result[0] if result else 0


# Get the default UserID (first row of db_user).
def GetFirstUserId(cursor: sqlite3.Cursor) -> int:
    cursor.execute("SELECT ID FROM db_user ORDER BY ID LIMIT 1")
    result = cursor.fetchone()
    return result[0] if result else 0


def GetFilesTime(file_path):
    """
    Return the creation, modification, access and permission-change times of a file.

    st_atime: last access time (FileAccessTime)
    st_mtime: last content-modification time (FileModifyTime)
    st_ctime: metadata change time; on Windows this is the file creation time (FileCreateTime)

    Note: Windows and Linux define these fields differently; on Linux st_ctime is the
    metadata change time, not the creation time.

    Args:
        file_path (str): absolute path of the file

    Returns:
        dict: FileCreateTime, FileModifyTime, FileAccessTime and FileAuthTime as strings,
              or "default" for every field if the times cannot be obtained
    """
    if not os.path.exists(file_path):
        return {
            "FileCreateTime": "default",
            "FileModifyTime": "default",
            "FileAccessTime": "default",
            "FileAuthTime": "default"
        }
    try:
        stat_info = os.stat(file_path)

        # Convert a timestamp into a readable "YYYY-MM-DD HH:MM:SS" string.
        def ts_to_str(timestamp):
            return datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')

        create_time = ts_to_str(stat_info.st_ctime)
        modify_time = ts_to_str(stat_info.st_mtime)
        access_time = ts_to_str(stat_info.st_atime)
        # Permission-change time: metadata change time on Linux; may not apply on Windows.
        try:
            auth_time = ts_to_str(getattr(stat_info, 'st_birthtime', stat_info.st_ctime))
        except Exception:
            auth_time = "default"
        return {
            "FileCreateTime": create_time,
            "FileModifyTime": modify_time,
            "FileAccessTime": access_time,
            "FileAuthTime": auth_time
        }
    except Exception as e:
        print(f"❌ Failed to read file times: {e}")
        return {
            "FileCreateTime": "default",
            "FileModifyTime": "default",
            "FileAccessTime": "default",
            "FileAuthTime": "default"
        }
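
# A minimal usage sketch (the path below is hypothetical):
#
#     times = GetFilesTime(r"C:\data\example.txt")
#     print(times["FileCreateTime"], times["FileModifyTime"])
#
# A missing or unreadable file yields the string "default" in every field, so the
# caller can insert the result into the database without extra None checks.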


# Get the device ID (first record in db_device).
def GetDeviceId(cursor: sqlite3.Cursor) -> int:
    cursor.execute("SELECT ID FROM db_device ORDER BY ID LIMIT 1")
    result = cursor.fetchone()
    return result[0] if result else 0


# Get the file size (placeholder data).
def GetFileSize(full_path: str) -> int:
    return random.randint(100, 999)


# Get the file content hash (placeholder data: hashes the path string, not the content).
def GetFileHash(full_path: str) -> str:
    return hashlib.sha256(full_path.encode()).hexdigest()
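
# GetFileSize and GetFileHash above intentionally return placeholder values (a random
# size and a hash of the path string rather than of the file content). If real content
# hashing were needed, a chunked read like the sketch below would keep memory use flat
# on large files; it is an illustration only and is not called anywhere in this module.
def _HashFileContent(full_path: str, chunk_size: int = 1 << 20) -> str:
    """Illustrative sketch only: SHA-256 of the file's actual bytes."""
    digest = hashlib.sha256()
    with open(full_path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()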


# Get the extent (fragment) count, 1-4.
def GetExtentCount(data):
    """
    Parse an NTFS 0x80 ($DATA) attribute and return the number of file fragments.

    Args:
        data (list): list of dicts, each of which must carry a 'sequence' key
                     holding space-separated hex byte strings.

    Returns:
        int: fragment count (1 for a resident attribute; for a non-resident
             attribute, the number of data runs in the run list).

    Raises:
        ValueError: if the input data is invalid.
    """
    # Step 1: collect and flatten the 'sequence' hex data.
    hex_bytes = []
    for entry in data:
        if 'sequence' in entry:
            for hex_str in entry['sequence']:
                hex_bytes.extend(hex_str.split())
    # Convert the hex strings into a list of integers.
    try:
        attribute_data = [int(x, 16) for x in hex_bytes]
    except ValueError:
        raise ValueError("Invalid hexadecimal data")

    # Step 2: analyse the attribute structure.
    if len(attribute_data) < 24:
        raise ValueError("Attribute data too short to parse the header")
    # Check the attribute type (0x80).
    if attribute_data[0] != 0x80:
        raise ValueError("Not a 0x80 ($DATA) attribute")
    # Check the resident flag (offset 0x08; 0 means resident).
    is_resident = attribute_data[8] == 0
    if is_resident:
        return 1

    # Parse the data-run list of the non-resident attribute.
    if len(attribute_data) < 0x22:
        raise ValueError("Attribute data too short to hold the run-list offset")
    data_run_offset = attribute_data[0x20] | (attribute_data[0x21] << 8)
    if data_run_offset >= len(attribute_data):
        raise ValueError("Data-run offset is beyond the attribute length")
    data_runs = attribute_data[data_run_offset:]
    fragment_count = 0
    pos = 0
    while pos < len(data_runs):
        header_byte = data_runs[pos]
        if header_byte == 0x00:
            break
        # Run header: low nibble = size of the length field,
        # high nibble = size of the offset field.
        offset_size = (header_byte >> 4) & 0x0F
        length_size = header_byte & 0x0F
        if length_size == 0 or offset_size == 0:
            break
        pos += 1 + length_size + offset_size
        fragment_count += 1
    return fragment_count
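
# A worked sketch of the data-run parsing above. The 'sequence' layout (a list of
# space-separated hex byte strings) is assumed to match what GetFile80hPattern returns;
# the bytes below are synthetic and the helper is illustrative only, never called here.
def _DemoExtentCount() -> int:
    """Build a fake non-resident $DATA attribute with two data runs; expect 2."""
    header = [0x00] * 64
    header[0x00] = 0x80   # attribute type: $DATA
    header[0x08] = 0x01   # non-resident flag
    header[0x20] = 0x40   # run list starts at offset 0x40 (low byte of a 16-bit value)
    # Run 1: header 0x21 -> 1-byte length field + 2-byte offset field.
    # Run 2: header 0x11 -> 1-byte length field + 1-byte offset field.
    # Trailing 0x00 terminates the run list.
    runs = [0x21, 0x10, 0x00, 0x04, 0x11, 0x08, 0x30, 0x00]
    sequence = ' '.join(f'{b:02X}' for b in header + runs)
    return GetExtentCount([{'sequence': [sequence]}])  # -> 2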


# Get a random location (placeholder data).
def GetRandomLocation() -> int:
    return random.randint(1000, 9999)


# Get a random length (placeholder data).
def GetRandomLength() -> int:
    return random.randint(1000, 9999)


# Main routine: import the db_path rows into db_node.
def InsertNodeDataToDB(db_path='../src/db_ntfs_info.db', table_name='db_node'):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    group_id = GetFirstGroupId(cursor)
    user_id = GetFirstUserId(cursor)
    device_id = GetDeviceId(cursor)

    cursor.execute("SELECT ID, Path, Name, ParentID FROM db_path")
    rows = cursor.fetchall()
    for row in rows:
        path_id, full_path, name, parent_id = row

        # Skip rows whose PathID already exists in db_node.
        cursor.execute("SELECT COUNT(*) FROM db_node WHERE PathID = ?", (path_id,))
        exists = cursor.fetchone()[0]
        if exists > 0:
            print(f"⚠️ PathID {path_id} already exists, skipping insert")
            continue

        # Compute the derived fields.
        name_hash = hashlib.sha256(name.encode()).hexdigest()
        dir_layer = GetDirLayer(full_path)
        extend_name_id = GetExtendNameId(name, cursor)
        file_size = GetFileSize(full_path)
        file_hash = GetFileHash(full_path)

        # File time attributes.
        file_times = GetFilesTime(full_path)
        create_time = file_times["FileCreateTime"]
        modify_time = file_times["FileModifyTime"]
        access_time = file_times["FileAccessTime"]
        auth_time = file_times["FileAuthTime"]

        # New: derive a more precise ExtentCount from the $80 ($DATA) attribute.
        try:
            attribute_80_data = GetFile80hPattern(full_path)
            if not attribute_80_data or not isinstance(attribute_80_data, list):
                raise ValueError("Invalid 80h attribute data")
            extent_count = GetExtentCount(attribute_80_data)
            print(f"✅ Extent count: {extent_count}")
        except Exception as e:
            print(f"⚠️ Failed to get ExtentCount, falling back to 0: {e}")
            extent_count = 0

        # Insert-statement fields and parameters (kept in the original order).
        fields = [
            'PathID', 'ParentID', 'NameHash', 'PathHash',
            'ExtendNameID', 'DirLayer', 'GroupID', 'UserID',
            'FileCreateTime', 'FileModifyTime', 'FileAccessTime', 'FileAuthTime',
            'FileSize', 'FileMode', 'FileHash', 'ExtentCount'
        ]
        values = [
            path_id, parent_id, name_hash, '',  # PathHash is filled in below
            extend_name_id, dir_layer, group_id, user_id,
            create_time, modify_time, access_time, auth_time,
            file_size, 'default', file_hash, extent_count
        ]

        # Look up PathHash (kept identical to db_path.PathHash).
        cursor.execute("SELECT PathHash FROM db_path WHERE ID = ?", (path_id,))
        path_hash_result = cursor.fetchone()
        path_hash = path_hash_result[0] if path_hash_result else ""
        values[3] = path_hash  # fill in PathHash

        # Build the extent fragment fields.
        extent_data = []
        for i in range(1, 5):
            if i <= extent_count:
                location = GetRandomLocation()
                length = GetRandomLength()
                extent_data.extend([device_id, location, length])
            else:
                extent_data.extend([None, None, None])

        # Append the extent fields and values.
        extent_fields = [
            "extent1_DeviceID", "extent1_Location", "extent1_Length",
            "extent2_DeviceID", "extent2_Location", "extent2_Length",
            "extent3_DeviceID", "extent3_Location", "extent3_Length",
            "extent4_DeviceID", "extent4_Location", "extent4_Length"
        ]
        fields += extent_fields
        values += extent_data

        # Build and execute the INSERT statement.
        placeholders = ', '.join('?' * len(values))
        insert_sql = f"INSERT INTO {table_name} ({', '.join(fields)}) VALUES ({placeholders})"
        cursor.execute(insert_sql, values)

    conn.commit()
    conn.close()
    print(f"✅ Data successfully inserted into {table_name}")


if __name__ == '__main__':
    InsertNodeDataToDB()