Optimize db_path memory usage: stream ScanVolume results into the database via a generator instead of materializing the full scan list in memory

This commit is contained in:
Burgess Leo
2025-05-23 18:01:42 +08:00
parent d4a411ce68
commit 491685e892
5 changed files with 48 additions and 46 deletions

View File

@@ -25,10 +25,10 @@ def ClearTableRecordsWithReset(db_path, table_name):
if __name__ == '__main__':
ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_path')
ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_node')
# ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_device')
# ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_config')
# ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_user')
# ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_group')
# ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_extend_extent')
# ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_extend_name')
ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_device')
ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_config')
ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_user')
ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_group')
ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_extend_extent')
ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_extend_name')

View File

@@ -0,0 +1,14 @@
import sqlite3
# Path to the NTFS metadata SQLite database, relative to this script's
# working directory — TODO confirm it matches the project layout.
db_path = "../src/db_ntfs_info.db"
# Module-level connection and cursor shared by the query below.
# NOTE(review): the connection is never closed; tolerable for a one-shot
# script, but consider conn.close() (or a `with` block) after use.
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
def GetDeviceId(cursor: sqlite3.Cursor) -> int:
    """Return the smallest ID stored in db_device, or 0 when the table is empty.

    :param cursor: an open cursor on a database containing a db_device table
    :return: the lowest db_device.ID, or 0 if no rows exist
    """
    # sqlite3's execute() returns the cursor itself, so the fetch can be chained.
    row = cursor.execute("SELECT ID FROM db_device ORDER BY ID LIMIT 1").fetchone()
    if row is None:
        # Empty table: fall back to 0 as the sentinel "no device" ID.
        return 0
    return row[0]
# Smoke test: print the lowest device ID found in the configured database.
print(GetDeviceId(cursor))

View File

@@ -4,6 +4,7 @@ import sqlite3
from datetime import datetime
from ntfs_utils.mft_analyze import GetFile80hPattern, GetFragmentData, ExtractSequenceHexValues, hex_list_to_int
from ntfs_utils.main import volume_letter
# 工具函数:获取文件扩展名
@@ -131,11 +132,10 @@ def GetFragmentLength(fragment):
# 主函数:将 db_path 数据导入 db_node
def InsertNodeDataToDB(volume_letter: str, db_path='../src/db_ntfs_info.db', table_name='db_node', batch_size=20):
def InsertNodeDataToDB(db_path='../src/db_ntfs_info.db', table_name='db_node', batch_size=20):
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
volume_letter = volume_letter.upper().strip()
if len(volume_letter) == 1:
volume_root = f"{volume_letter}:\\"
elif volume_letter.endswith(':'):
@@ -254,5 +254,4 @@ def InsertNodeDataToDB(volume_letter: str, db_path='../src/db_ntfs_info.db', tab
if __name__ == "__main__":
volume_letter_test = "Y"
InsertNodeDataToDB(volume_letter=volume_letter_test)
InsertNodeDataToDB()

View File

@@ -23,7 +23,7 @@ def ShouldSkipPath(path: str) -> bool:
return False
def ScanVolume(volume_letter: str):
def ScanVolume(volume_letter: str) -> list:
"""
完整扫描指定磁盘的所有文件和目录,忽略 NTFS 元文件和系统文件夹,
并为每个节点分配 ParentID。
@@ -33,7 +33,6 @@ def ScanVolume(volume_letter: str):
if not os.path.exists(root_path):
raise ValueError(f"磁盘 {root_path} 不存在")
result = []
path_to_id = {} # 路径 -> ID 映射
counter = 1
@@ -61,15 +60,12 @@ def ScanVolume(volume_letter: str):
name = entry
# 分离盘符并去除开头和结尾的 \
# 分离盘符并处理路径格式
_, relative_path = os.path.splitdrive(full_path)
relative_path = relative_path.lstrip("\\").rstrip("\\")
# 如果是目录,结尾加 /
if os.path.isdir(full_path) and not relative_path.endswith("/"):
relative_path += "/"
# 替换所有 \ -> /
relative_path = relative_path.replace("\\", "/")
path_hash = GenerateHash(relative_path)
@@ -97,25 +93,27 @@ def ScanVolume(volume_letter: str):
"ContentSize": content_size
}
result.append(item)
yield item # 使用 yield 返回每条记录
path_to_id[relative_path] = counter
counter += 1
except Exception as e:
print(f"⚠️ 跳过路径 {full_path},错误: {e}")
return result
def InsertPathDataToDB(data, db_path='../src/db_ntfs_info.db', table_name='db_path', batch_size=20):
def InsertPathDataToDB(data_generator, db_path='../src/db_ntfs_info.db', table_name='db_path', batch_size=20):
"""
批量将扫描结果写入数据库。
流式写入数据库,边扫描边入库。
:param data_generator: 可迭代对象(如生成器)
:param db_path: 数据库路径
:param table_name: 表名
:param batch_size: 每多少条记录提交一次
"""
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
try:
# 创建表(如果不存在)
create_table_sql = f"""
CREATE TABLE IF NOT EXISTS {table_name} (
ID INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -125,23 +123,20 @@ def InsertPathDataToDB(data, db_path='../src/db_ntfs_info.db', table_name='db_pa
IsDir INTEGER NOT NULL CHECK(IsDir IN (0, 1)),
ParentID INTEGER,
ContentSize INTEGER,
FOREIGN KEY(ParentID) REFERENCES {table_name}(ID)
);
"""
cursor.execute(create_table_sql)
# 插入语句(忽略重复 PathHash
insert_sql = f"""
INSERT OR IGNORE INTO {table_name}
(Path, Name, PathHash, IsDir, ParentID, ContentSize)
VALUES (?, ?, ?, ?, ?, ?)
"""
total_inserted = 0
batch = []
for item in data:
for item in data_generator:
batch.append((
item['Path'],
item['Name'],
@@ -154,39 +149,34 @@ def InsertPathDataToDB(data, db_path='../src/db_ntfs_info.db', table_name='db_pa
if len(batch) >= batch_size:
cursor.executemany(insert_sql, batch)
conn.commit()
total_inserted += cursor.rowcount
print(f"✅ 提交一批 {len(batch)} 条数据")
batch.clear()
# 插入剩余数据
# 提交剩余不足一批的数据
if batch:
cursor.executemany(insert_sql, batch)
conn.commit()
total_inserted += cursor.rowcount
print(f"✅ 提交最后一批 {len(batch)} 条数据")
print(f"✅ 总共插入 {total_inserted} 条记录到数据库。")
except Exception as e:
print(f"❌ 插入失败: {e}")
conn.rollback()
finally:
conn.close()
# 示例主函数
def main():
volume_letter = "Y"
def DBPathMain(volume_letter: str):
print(f"🔍 开始全盘扫描磁盘 {volume_letter}:\\ ...")
scanned_data = ScanVolume(volume_letter)
print(f"📊 共扫描到 {len(scanned_data)} 条有效记录,开始入库...")
InsertPathDataToDB(scanned_data)
# 获取生成器对象
generator = ScanVolume(volume_letter)
print(f"📊 开始逐批入库...")
InsertPathDataToDB(generator)
print("✅ 全盘扫描与入库完成")
if __name__ == "__main__":
main()
DBPathMain(volume_letter="Y")

View File

@@ -2,8 +2,8 @@ from ntfs_utils.db_config import GetNTFSBootInfo, InsertInfoToDBConfig
from ntfs_utils.db_device import ScanSpecialVolumes, InsertVolumesToDB
from ntfs_utils.db_extend_name import InsertExtensionsToDB
from ntfs_utils.db_group import InsertGroupToDB
from ntfs_utils.db_node import InsertNodeDataToDB
from ntfs_utils.db_path import ScanVolume, InsertPathDataToDB
# from ntfs_utils.db_node import InsertNodeDataToDB
from ntfs_utils.db_path import DBPathMain
from ntfs_utils.db_user import InsertUserToDB
volume_letter = 'Y'
@@ -26,10 +26,6 @@ def main():
group_name_list = ["Copier"]
InsertGroupToDB(group_name_list)
# 初始化 db_path 表
scanned_data = ScanVolume(volume_letter)
InsertPathDataToDB(scanned_data)
# 初始化 db_extend_name 表
common_extensions = [
"txt", "log", "csv", "xls", "xlsx", "doc", "docx",
@@ -41,8 +37,11 @@ def main():
count = InsertExtensionsToDB(common_extensions)
print(f"共插入 {count} 个新扩展名。")
# 初始化 db_path 表
DBPathMain(volume_letter=volume_letter)
# 初始化 db_node 表
InsertNodeDataToDB(volume_letter)
# InsertNodeDataToDB(volume_letter)
if __name__ == '__main__':