# Source listing metadata: 179 lines, 5.2 KiB, Python
import hashlib
|
||
import os
|
||
import sqlite3
|
||
|
||
|
||
def GenerateHash(s: str) -> str:
    """Return the SHA-256 digest of ``s`` as a hexadecimal string.

    The text is encoded as UTF-8 before hashing. Within this file the result
    is used as ``PathHash``, the unique identifier of a filesystem path.
    """
    hasher = hashlib.sha256()
    hasher.update(s.encode('utf-8'))
    return hasher.hexdigest()
|
||
|
||
|
||
def ShouldSkipPath(path: str) -> bool:
    """Tell whether ``path`` should be left out of a scan.

    Only the last path component is inspected. A path is skipped when that
    component starts with ``$`` (NTFS metafiles) or is exactly the
    "System Volume Information" system folder.
    """
    leaf = os.path.basename(path)
    return leaf.startswith('$') or leaf == "System Volume Information"
|
||
|
||
|
||
def ScanVolume(volume_letter: str):
    """Walk a whole drive and describe every file and directory found on it.

    NTFS metafiles and system folders (as decided by ``ShouldSkipPath``) are
    ignored, and skipped directories are not descended into. Entries that are
    neither a directory nor a regular file are left out, and any entry that
    raises while being inspected is reported on stdout and skipped.

    Args:
        volume_letter: Drive letter; it is upper-cased and turned into the
            root path ``"<LETTER>:\\"``.

    Returns:
        A list of dicts, one per entry, with the keys:
        ``ID`` (sequential number starting at 1, in discovery order),
        ``Path`` (full path), ``Name`` (entry name), ``PathHash``
        (``GenerateHash`` of the full path), ``IsDir`` (1 or 0),
        ``ParentID`` (``ID`` of the containing directory, or 0 when that
        directory has no record, e.g. the drive root) and ``ContentSize``
        (size in KB rounded down, but at least 1 for non-empty files;
        always 0 for directories).

    Raises:
        ValueError: If the drive root does not exist.
    """
    root_path = f"{volume_letter.upper()}:\\"
    if not os.path.exists(root_path):
        raise ValueError(f"磁盘 {root_path} 不存在")

    records = []
    ids_by_path = {}  # full path -> ID already handed out for that path
    next_id = 1       # stands in for a database auto-increment column

    walker = os.walk(root_path, topdown=True, onerror=None, followlinks=False)
    for current_dir, subdirs, filenames in walker:
        # Prune in place so os.walk never descends into skipped directories.
        subdirs[:] = [
            sub for sub in subdirs
            if not ShouldSkipPath(os.path.join(current_dir, sub))
        ]

        for entry in filenames + subdirs:
            full_path = os.path.join(current_dir, entry)
            if ShouldSkipPath(full_path):
                continue

            try:
                if os.path.isdir(full_path):
                    is_dir, bytes_size = 1, 0
                elif os.path.isfile(full_path):
                    is_dir, bytes_size = 0, os.path.getsize(full_path)
                else:
                    # Neither a directory nor a regular file: not recorded.
                    continue

                # Whole kilobytes, except that a non-empty file never shows 0.
                size_kb = bytes_size // 1024
                if bytes_size > 0 and size_kb == 0:
                    size_kb = 1

                # The walk is top-down, so a recorded parent directory has
                # already been given its ID by the time its children appear.
                parent_id = ids_by_path.get(os.path.dirname(full_path), 0)

                records.append({
                    "ID": next_id,
                    "Path": full_path,
                    "Name": entry,
                    "PathHash": GenerateHash(full_path),
                    "IsDir": is_dir,
                    "ParentID": parent_id,
                    "ContentSize": size_kb
                })
                ids_by_path[full_path] = next_id
                next_id += 1

            except Exception as err:
                print(f"⚠️ 跳过路径 {full_path},错误: {err}")

    return records
|
||
|
||
|
||
def InsertPathDataToDB(data, db_path='../src/db_ntfs_info.db', table_name='db_path', batch_size=20):
    """Write scan records into a SQLite table, committing in batches.

    The table is created if it does not exist. Records whose ``PathHash`` is
    already present are ignored (``INSERT OR IGNORE``).

    Each record's ``ParentID`` is expressed in the scan's own ``ID``
    numbering, while the table assigns real IDs through AUTOINCREMENT. The
    two only coincide for an empty table with no ignored rows, so every
    record's scan ``ID`` is mapped to the ID of its actual row and
    ``ParentID`` is translated through that mapping before it is stored.
    Records without an ``ID`` key, or whose parent is unknown to the mapping,
    keep their ``ParentID`` unchanged.

    Failures are reported on stdout and the uncommitted batch is rolled back;
    batches committed earlier stay in the database.

    Args:
        data: Iterable of dicts with the keys ``Path``, ``Name``,
            ``PathHash``, ``IsDir``, ``ParentID``, ``ContentSize`` and,
            optionally, ``ID``. Parents must come before their children.
        db_path: Path of the SQLite database file.
        table_name: Target table. It is interpolated directly into the SQL
            text, so it must come from trusted code, never from user input.
        batch_size: Number of processed records between commits.

    Returns:
        int: Number of rows that were inserted and committed.
    """
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    total_inserted = 0

    try:
        create_table_sql = f"""
            CREATE TABLE IF NOT EXISTS {table_name} (
                ID INTEGER PRIMARY KEY AUTOINCREMENT,
                Path TEXT NOT NULL,
                Name TEXT NOT NULL,
                PathHash TEXT UNIQUE NOT NULL,
                IsDir INTEGER NOT NULL CHECK(IsDir IN (0, 1)),
                ParentID INTEGER,
                ContentSize INTEGER,

                FOREIGN KEY(ParentID) REFERENCES {table_name}(ID)
            );
        """
        cursor.execute(create_table_sql)

        # Duplicate PathHash values are skipped rather than raising.
        insert_sql = f"""
            INSERT OR IGNORE INTO {table_name}
            (Path, Name, PathHash, IsDir, ParentID, ContentSize)
            VALUES (?, ?, ?, ?, ?, ?)
        """
        lookup_sql = f"SELECT ID FROM {table_name} WHERE PathHash = ?"

        scan_to_db_id = {}   # scan-local ID -> ID of the matching table row
        pending = 0          # records processed since the last commit
        pending_inserted = 0  # rows inserted since the last commit

        for item in data:
            scan_parent = item['ParentID'] or 0
            # Translate the parent reference; fall back to the raw value
            # (0 for top-level entries) when no mapping is known.
            parent_id = scan_to_db_id.get(scan_parent, scan_parent)

            cursor.execute(insert_sql, (
                item['Path'],
                item['Name'],
                item['PathHash'],
                item['IsDir'],
                parent_id,
                item['ContentSize'],
            ))

            if cursor.rowcount > 0:
                pending_inserted += 1
                db_id = cursor.lastrowid
            else:
                # Row was ignored: find the existing row so that children of
                # this record can still be linked to it.
                existing = cursor.execute(lookup_sql, (item['PathHash'],)).fetchone()
                db_id = existing[0] if existing else None

            scan_id = item.get('ID')
            if scan_id is not None and db_id is not None:
                scan_to_db_id[scan_id] = db_id

            pending += 1
            if pending >= batch_size:
                conn.commit()
                total_inserted += pending_inserted
                print(f"✅ 提交一批 {pending} 条数据")
                pending = 0
                pending_inserted = 0

        # Flush whatever is left after the last full batch.
        if pending:
            conn.commit()
            total_inserted += pending_inserted
            print(f"✅ 提交最后一批 {pending} 条数据")

        print(f"✅ 总共插入 {total_inserted} 条记录到数据库。")

    except Exception as e:
        # Best effort: report, discard the uncommitted batch, keep going.
        print(f"❌ 插入失败: {e}")
        conn.rollback()

    finally:
        conn.close()

    return total_inserted
|
||
|
||
|
||
# Example main function
|
||
def main():
    """Scan drive ``Z`` and store the result in the default database.

    Progress is reported on stdout before the scan, before the insert and
    once both calls have returned.
    """
    drive = "Z"

    print(f"🔍 开始全盘扫描磁盘 {drive}:\\ ...")
    records = ScanVolume(drive)

    print(f"📊 共扫描到 {len(records)} 条有效记录,开始入库...")
    InsertPathDataToDB(records)

    print("✅ 全盘扫描与入库完成")
|
||
|
||
|
||
# Run the scan only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|