Commit 5a2cce07 authored by uuo00_n's avatar uuo00_n

feat(敏感词): 增强敏感词分类和严重程度管理

添加敏感词分类系统和严重程度分级,支持批量导入和多维度筛选
扩展敏感词检测功能,记录详细触发信息
更新API接口支持分类管理和筛选功能
完善文档说明数据库结构和分类系统
parent 2612a04e
......@@ -11,7 +11,9 @@ LLM-Filter 是一个基于大型语言模型(LLM)的智能对话系统,集成
- **用户管理**:支持用户注册、登录和权限管理
- **对话历史**:保存和管理用户的对话历史记录
- **敏感词管理**:提供敏感词的添加、删除和查询功能
- **敏感词分类**:支持敏感词分类和子分类管理,可按类别筛选
- **批量导入**:支持批量导入敏感词,提高管理效率
- **敏感记录追踪**:记录并可查询敏感词触发情况,支持多维度筛选
## 系统架构
......@@ -31,6 +33,61 @@ LLM-Filter 是一个基于大型语言模型(LLM)的智能对话系统,集成
- **工具层**:提供敏感词过滤等功能
- **数据库层**:处理数据持久化
### 数据库结构
系统使用MongoDB作为数据库,包含以下集合:
#### 1. users 集合
用户信息存储,包含字段:
- `_id`: 用户唯一标识
- `username`: 用户名
- `email`: 电子邮箱
- `hashed_password`: 加密后的密码
- `role`: 用户角色,可为 "user" 或 "admin"
- `created_at`: 创建时间
- `updated_at`: 更新时间
#### 2. conversations 集合
对话信息存储,包含字段:
- `_id`: 对话唯一标识
- `user_id`: 关联的用户ID
- `messages`: 消息列表,每条消息包含:
- `role`: 消息角色,可为 "user" 或 "assistant"
- `content`: 消息内容
- `timestamp`: 消息时间戳
- `contains_sensitive_words`: 是否包含敏感词
- `sensitive_words_found`: 发现的敏感词列表
- `created_at`: 创建时间
- `updated_at`: 更新时间
#### 3. sensitive_words 集合
敏感词信息存储,包含字段:
- `_id`: 敏感词唯一标识
- `word`: 敏感词内容
- `category`: 敏感词主分类(如"违法活动"、"不良内容"等)
- `subcategory`: 敏感词子分类(如"赌博"、"自杀"等)
- `severity`: 严重程度(1-5级,5为最严重)
- `created_at`: 创建时间
- `updated_at`: 更新时间
#### 4. sensitive_records 集合
敏感词检测记录,包含字段:
- `_id`: 记录唯一标识
- `user_id`: 关联的用户ID
- `conversation_id`: 关联的对话ID
- `message_content`: 触发检测的消息内容
- `sensitive_words_found`: 发现的敏感词详细信息列表,每项包含:
- `word`: 敏感词
- `category`: 主分类
- `subcategory`: 子分类
- `severity`: 严重程度
- `highest_severity`: 记录中最高的严重程度
- `timestamp`: 记录时间
## 安装与配置
### 环境要求
......@@ -64,6 +121,28 @@ LLM-Filter 是一个基于大型语言模型(LLM)的智能对话系统,集成
OLLAMA_API_BASE_URL=http://localhost:11434
OLLAMA_MODEL=llama2
```
**生成安全的SECRET_KEY**
为了确保系统安全,请使用以下方法生成一个强随机密钥:
```python
# 在Python终端中运行
import secrets
print(secrets.token_hex(32)) # 生成一个64字符的随机十六进制字符串
```
或者使用命令行:
```bash
# Linux/Mac
openssl rand -hex 32
# 或者
python -c "import secrets; print(secrets.token_hex(32))"
```
将生成的密钥复制到`.env`文件的`SECRET_KEY`变量中。
4. 初始化数据库
```bash
......@@ -94,8 +173,8 @@ LLM-Filter 是一个基于大型语言模型(LLM)的智能对话系统,集成
- `POST /api/v1/sensitive-words` - 添加敏感词
- `DELETE /api/v1/sensitive-words/{word_id}` - 删除敏感词
- `GET /api/v1/sensitive-words` - 获取所有敏感词(支持按类别、子类别和严重程度筛选)
- `GET /api/v1/sensitive-records` - 获取敏感词记录(支持按用户、对话、时间范围、类别、子类别和严重程度筛选)
## 项目结构
......@@ -124,6 +203,22 @@ llm-filter/
- `TrieNode` 类:实现 Trie 树的节点结构
- `SensitiveWordFilter` 类:提供敏感词加载和检测功能
#### 敏感词分类系统
敏感词采用多层分类体系,便于管理和筛选:
1. **主分类**:包括违法活动、不良内容、政治内容、歧视言论、暴力内容、色情内容、毒品相关、赌博相关、诈骗相关等
2. **子分类**:每个主分类下设多个子分类,如:
- 违法活动:贩毒、赌博、诈骗、传销等
- 不良内容:自杀、自残、暴力、血腥等
- 歧视言论:种族歧视、性别歧视、地域歧视等
3. **严重程度**:1-5级,5级为最严重
这种分类系统使管理员能够:
- 精确定位和管理敏感内容
- 按类别和严重程度筛选敏感记录
- 针对不同类型的敏感内容制定不同的处理策略
### 对话服务
对话功能通过以下组件实现:
......
from fastapi import APIRouter, Depends, HTTPException, status
from typing import List, Optional
from fastapi import APIRouter, Depends, HTTPException, status, File, UploadFile
from typing import List, Optional, Dict
from datetime import datetime
import json
import csv
import io
from app.api.deps import get_current_admin_user
from app.schemas.sensitive_word import SensitiveWordCreate, SensitiveWordResponse, SensitiveRecordResponse
from app.services.sensitive_word import add_sensitive_word, delete_sensitive_word, get_all_sensitive_words, get_sensitive_records
from app.schemas.sensitive_word import (
SensitiveWordCreate, SensitiveWordResponse, SensitiveRecordResponse,
SensitiveWordBulkImport, CategoryCreate, CategoryResponse, CategoriesResponse
)
from app.services.sensitive_word import (
add_sensitive_word, delete_sensitive_word, get_all_sensitive_words,
get_sensitive_records, get_categories, add_category, update_category,
delete_category, bulk_import_sensitive_words
)
from app.models.sensitive_word import SENSITIVE_WORD_CATEGORIES, SENSITIVE_WORD_SUBCATEGORIES
router = APIRouter()
......@@ -13,9 +24,68 @@ async def create_sensitive_word(
_: dict = Depends(get_current_admin_user)
):
"""添加敏感词(仅管理员)"""
word_id = await add_sensitive_word(word_data.word, word_data.category)
word_id = await add_sensitive_word(
word_data.word,
word_data.category,
word_data.subcategory,
word_data.severity
)
return {"id": word_id}
@router.post("/sensitive-words/bulk", response_model=dict, status_code=status.HTTP_201_CREATED)
async def bulk_create_sensitive_words(
    words_data: SensitiveWordBulkImport,
    _: dict = Depends(get_current_admin_user)
):
    """Bulk-add sensitive words in a single request (admin only)."""
    # Delegate the whole batch to the service layer and report how many it took.
    imported = await bulk_import_sensitive_words(words_data.words)
    return {"imported_count": imported}
@router.post("/sensitive-words/import", status_code=status.HTTP_201_CREATED)
async def import_sensitive_words_from_file(
    file: UploadFile = File(...),
    _: dict = Depends(get_current_admin_user)
):
    """Import sensitive words from an uploaded file (admin only).

    Supported formats (extension matched case-insensitively):
    - CSV: header row with columns word,category,subcategory,severity
    - JSON: an array of objects with word, category, subcategory, severity fields

    Returns:
        dict: {"imported_count": number of words passed to the bulk importer}

    Raises:
        HTTPException 400: unsupported extension, or undecodable/malformed content.
    """
    # Validate the extension before reading the body; also accept .CSV/.JSON.
    filename = (file.filename or "").lower()
    if not filename.endswith((".csv", ".json")):
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="仅支持CSV和JSON格式文件"
        )

    content = await file.read()
    words: List[SensitiveWordCreate] = []
    try:
        if filename.endswith(".csv"):
            reader = csv.DictReader(io.StringIO(content.decode("utf-8")))
            for row in reader:
                raw_severity = row.get("severity")
                # Blank or missing severity falls back to the mildest level (1).
                severity = int(raw_severity) if raw_severity else 1
                words.append(SensitiveWordCreate(
                    word=row["word"],
                    category=row["category"],
                    subcategory=row.get("subcategory"),
                    severity=severity
                ))
        else:  # .json
            for item in json.loads(content):
                words.append(SensitiveWordCreate(
                    word=item["word"],
                    category=item["category"],
                    subcategory=item.get("subcategory"),
                    severity=item.get("severity", 1)
                ))
    except (UnicodeDecodeError, json.JSONDecodeError, KeyError, ValueError) as exc:
        # Malformed files are a client error, not an internal server error.
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"文件解析失败: {exc}"
        ) from exc

    count = await bulk_import_sensitive_words(words)
    return {"imported_count": count}
@router.delete("/sensitive-words/{word_id}", status_code=status.HTTP_204_NO_CONTENT)
async def remove_sensitive_word(
word_id: str,
......@@ -32,17 +102,104 @@ async def remove_sensitive_word(
@router.get("/sensitive-words", response_model=List[SensitiveWordResponse])
async def list_sensitive_words(
    category: Optional[str] = None,
    subcategory: Optional[str] = None,
    min_severity: Optional[int] = None,
    max_severity: Optional[int] = None,
    _: dict = Depends(get_current_admin_user)
):
    """List all sensitive words (admin only).

    Optional filters:
    - category: top-level category
    - subcategory: finer-grained category
    - min_severity / max_severity: severity range (1-5)
    """
    # Stale pre-refactor docstring/early-return removed: it returned the
    # unfiltered list and made the filtered call below unreachable.
    return await get_all_sensitive_words(category, subcategory, min_severity, max_severity)
@router.get("/sensitive-records", response_model=List[SensitiveRecordResponse])
async def list_sensitive_records(
    user_id: Optional[str] = None,
    conversation_id: Optional[str] = None,
    start_date: Optional[datetime] = None,
    end_date: Optional[datetime] = None,
    category: Optional[str] = None,
    subcategory: Optional[str] = None,
    min_severity: Optional[int] = None,
    max_severity: Optional[int] = None,
    _: dict = Depends(get_current_admin_user)
):
    """Fetch sensitive-word detection records (admin only).

    Optional filters:
    - user_id: restrict to one user
    - conversation_id: restrict to one conversation
    - start_date / end_date: time range
    - category / subcategory: word classification
    - min_severity / max_severity: severity range (1-5)
    """
    # Arguments are positional; their order must match the service signature.
    return await get_sensitive_records(
        user_id, conversation_id, start_date, end_date,
        category, subcategory, min_severity, max_severity
    )
@router.get("/categories", response_model=CategoriesResponse)
async def list_categories(
    _: dict = Depends(get_current_admin_user)
):
    """Return every sensitive-word category with its subcategories (admin only)."""
    categories = await get_categories()
    return {"categories": categories}
@router.get("/categories/default", response_model=CategoriesResponse)
async def get_default_categories(
    _: dict = Depends(get_current_admin_user)
):
    """Return the built-in default category -> subcategory mapping (admin only)."""
    defaults = {}
    for name in SENSITIVE_WORD_CATEGORIES:
        # Categories without a declared subcategory list map to an empty list.
        defaults[name] = SENSITIVE_WORD_SUBCATEGORIES.get(name, [])
    return {"categories": defaults}
@router.post("/categories", response_model=CategoryResponse, status_code=status.HTTP_201_CREATED)
async def create_category(
    category_data: CategoryCreate,
    _: dict = Depends(get_current_admin_user)
):
    """Add a sensitive-word category (admin only).

    Raises:
        HTTPException 400: the category already exists.
    """
    # Removed stale pre-refactor lines that returned
    # `get_sensitive_records(user_id, ...)` with undefined names (NameError).
    success = await add_category(category_data.name, category_data.subcategories)
    if not success:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="分类已存在"
        )
    return {"name": category_data.name, "subcategories": category_data.subcategories}
@router.put("/categories/{category_name}", response_model=CategoryResponse)
async def update_category_subcategories(
    category_name: str,
    subcategories: List[str],
    _: dict = Depends(get_current_admin_user)
):
    """Replace the subcategory list of an existing category (admin only)."""
    updated = await update_category(category_name, subcategories)
    if not updated:
        # Service reports False when the category does not exist.
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="分类不存在"
        )
    return {"name": category_name, "subcategories": subcategories}
@router.delete("/categories/{category_name}", status_code=status.HTTP_204_NO_CONTENT)
async def remove_category(
    category_name: str,
    _: dict = Depends(get_current_admin_user)
):
    """Delete a sensitive-word category (admin only); 204 on success."""
    deleted = await delete_category(category_name)
    if deleted:
        return None
    # Service refuses when the category is missing or is a protected default.
    raise HTTPException(
        status_code=status.HTTP_404_NOT_FOUND,
        detail="分类不存在或无法删除默认分类"
    )
\ No newline at end of file
......@@ -4,11 +4,41 @@ from pydantic import BaseModel, Field
from bson import ObjectId
from app.models.user import PyObjectId
# Top-level sensitive-word categories (extensible).
SENSITIVE_WORD_CATEGORIES = [
    "违法活动",
    "不良内容",
    "政治内容",
    "歧视言论",
    "暴力内容",
    "色情内容",
    "毒品相关",
    "赌博相关",
    "诈骗相关",
    "其他"
]
# Subcategories for each top-level category (extensible).
SENSITIVE_WORD_SUBCATEGORIES = {
    "违法活动": ["贩毒", "赌博", "诈骗", "传销", "其他违法"],
    "不良内容": ["自杀", "自残", "暴力", "血腥", "其他不良"],
    "政治内容": ["敏感人物", "敏感事件", "敏感地区", "其他政治"],
    "歧视言论": ["种族歧视", "性别歧视", "地域歧视", "其他歧视"],
    "暴力内容": ["肢体暴力", "语言暴力", "恐怖主义", "其他暴力"],
    "色情内容": ["露骨描述", "性暗示", "色情服务", "其他色情"],
    "毒品相关": ["毒品名称", "制毒方法", "吸毒工具", "其他毒品"],
    "赌博相关": ["赌博方式", "赌博平台", "赌博工具", "其他赌博"],
    "诈骗相关": ["电信诈骗", "网络诈骗", "金融诈骗", "其他诈骗"],
    "其他": ["未分类"]
}
# 敏感词模型
class SensitiveWordModel(BaseModel):
id: PyObjectId = Field(default_factory=PyObjectId, alias="_id")
word: str
category: Optional[str] = None
category: str = Field(..., description="敏感词主分类")
subcategory: Optional[str] = Field(None, description="敏感词子分类")
severity: Optional[int] = Field(1, description="严重程度 1-5,5为最严重", ge=1, le=5)
created_at: datetime = Field(default_factory=datetime.now)
updated_at: datetime = Field(default_factory=datetime.now)
......@@ -17,13 +47,21 @@ class SensitiveWordModel(BaseModel):
arbitrary_types_allowed = True
json_encoders = {ObjectId: str}
# Detailed information about a single matched sensitive word.
class SensitiveWordInfo(BaseModel):
    """Full metadata for one sensitive word found in a message."""
    word: str  # the matched word itself
    category: str  # top-level category
    subcategory: Optional[str] = None  # finer-grained category, if any
    severity: Optional[int] = 1  # 1-5, 5 = most severe
# 敏感词记录模型
class SensitiveRecordModel(BaseModel):
id: PyObjectId = Field(default_factory=PyObjectId, alias="_id")
user_id: PyObjectId
conversation_id: PyObjectId
message_content: str
sensitive_words_found: List[str]
sensitive_words_found: List[SensitiveWordInfo] # 使用详细信息替代简单字符串列表
highest_severity: int = 1 # 记录中最高的严重程度
timestamp: datetime = Field(default_factory=datetime.now)
class Config:
......
from typing import List, Optional
from pydantic import BaseModel
from typing import List, Optional, Dict
from pydantic import BaseModel, Field
from datetime import datetime
class SensitiveWordCreate(BaseModel):
    """Request payload for creating a single sensitive word."""
    # Removed the duplicate pre-refactor `category: Optional[str] = None`
    # annotation that shadowed the required field below.
    word: str
    category: str  # required top-level category
    subcategory: Optional[str] = None  # optional finer classification
    severity: Optional[int] = Field(1, ge=1, le=5)  # 1-5, 5 = most severe
class SensitiveWordBulkImport(BaseModel):
    """Request payload for bulk-importing sensitive words."""
    words: List[SensitiveWordCreate]  # words to insert in one batch
class SensitiveWordInfoResponse(BaseModel):
    """Detail of one matched sensitive word, as returned inside a record."""
    word: str  # the matched word
    category: str  # top-level category
    subcategory: Optional[str] = None  # finer-grained category, if any
    severity: Optional[int] = 1  # 1-5, 5 = most severe
class SensitiveWordResponse(BaseModel):
    """API representation of a stored sensitive word."""
    # Removed the duplicate pre-refactor `category: Optional[str] = None`
    # annotation that conflicted with the required field below.
    id: str
    word: str
    category: str  # required top-level category
    subcategory: Optional[str] = None
    severity: Optional[int] = 1  # 1-5, 5 = most severe
    created_at: datetime
    updated_at: Optional[datetime] = None
class SensitiveRecordResponse(BaseModel):
    """API representation of one sensitive-word detection record."""
    # Removed the stale pre-refactor `sensitive_words_found: List[str]` /
    # `timestamp` fields and the diff marker line that were left in the body.
    id: str
    user_id: str
    conversation_id: str
    message_content: str  # the message that triggered detection
    sensitive_words_found: List[SensitiveWordInfoResponse]  # detailed matches
    highest_severity: int = 1  # max severity among matched words
    timestamp: datetime
class CategoryCreate(BaseModel):
    """Request payload for creating a sensitive-word category."""
    name: str  # category name (must be unique)
    subcategories: List[str] = []  # initial subcategories, may be empty
class CategoryResponse(BaseModel):
    """API representation of a single category and its subcategories."""
    name: str
    subcategories: List[str]
class CategoriesResponse(BaseModel):
    """Mapping of every category name to its list of subcategories."""
    categories: Dict[str, List[str]]
\ No newline at end of file
......@@ -43,7 +43,10 @@ async def add_message(conversation_id: str, user_id: str, content: str) -> Dict[
Dict: 包含处理结果的字典
"""
# 检查敏感词
contains_sensitive, sensitive_words = sensitive_word_filter.check_text(content)
check_result = sensitive_word_filter.check_text(content)
contains_sensitive = check_result["contains_sensitive_words"]
sensitive_words = check_result["sensitive_words_found"]
highest_severity = check_result["highest_severity"]
# 创建用户消息
user_message = {
......@@ -51,7 +54,8 @@ async def add_message(conversation_id: str, user_id: str, content: str) -> Dict[
"content": content,
"timestamp": datetime.now(),
"contains_sensitive_words": contains_sensitive,
"sensitive_words_found": sensitive_words
"sensitive_words_found": sensitive_words,
"highest_severity": highest_severity
}
# 更新对话
......@@ -71,6 +75,7 @@ async def add_message(conversation_id: str, user_id: str, content: str) -> Dict[
"conversation_id": ObjectId(conversation_id),
"message_content": content,
"sensitive_words_found": sensitive_words,
"highest_severity": highest_severity,
"timestamp": datetime.now()
}
......
This diff is collapsed.
from typing import List, Set, Dict, Tuple
from typing import List, Set, Dict, Tuple, Any
from app.db.mongodb import db
class TrieNode:
......@@ -6,15 +6,16 @@ class TrieNode:
def __init__(self):
    # char -> child TrieNode
    self.children = {}
    # True when a complete sensitive word terminates at this node
    self.is_end_of_word = False
    # Full metadata dict of the word ending here (None for interior nodes)
    self.word_info = None
class SensitiveWordFilter:
def __init__(self):
self.root = TrieNode()
self.sensitive_words = set()
self.sensitive_words = {} # 改为字典,存储敏感词及其信息
async def load_sensitive_words(self):
"""从数据库加载敏感词"""
self.sensitive_words = set()
self.sensitive_words = {}
self.root = TrieNode()
# 从数据库获取敏感词
......@@ -22,19 +23,28 @@ class SensitiveWordFilter:
async for document in cursor:
word = document.get("word", "")
if word:
self.sensitive_words.add(word)
self._add_to_trie(word)
# 提取敏感词的完整信息
word_info = {
"id": str(document.get("_id")),
"word": word,
"category": document.get("category"),
"subcategory": document.get("subcategory"),
"severity": document.get("severity", 1)
}
self.sensitive_words[word] = word_info
self._add_to_trie(word, word_info)
def _add_to_trie(self, word: str):
"""将敏感词添加到Trie树中"""
def _add_to_trie(self, word: str, word_info: Dict[str, Any]):
"""将敏感词添加到Trie树中,并存储其完整信息"""
node = self.root
for char in word:
if char not in node.children:
node.children[char] = TrieNode()
node = node.children[char]
node.is_end_of_word = True
node.word_info = word_info
def check_text(self, text: str) -> Tuple[bool, List[str]]:
def check_text(self, text: str) -> Dict[str, Any]:
"""
检查文本是否包含敏感词
......@@ -42,12 +52,21 @@ class SensitiveWordFilter:
text: 要检查的文本
Returns:
Tuple[bool, List[str]]: (是否包含敏感词, 找到的敏感词列表)
Dict[str, Any]: {
"contains_sensitive_words": bool,
"sensitive_words_found": List[Dict],
"highest_severity": int
}
"""
if not text:
return False, []
return {
"contains_sensitive_words": False,
"sensitive_words_found": [],
"highest_severity": 0
}
found_words = []
highest_severity = 0
text_lower = text.lower() # 转为小写进行匹配
# 遍历文本的每个字符作为起点
......@@ -63,12 +82,23 @@ class SensitiveWordFilter:
node = node.children[char]
# 如果到达某个敏感词的结尾
if node.is_end_of_word:
word = text_lower[i:j+1]
found_words.append(word)
if node.is_end_of_word and node.word_info:
# 使用原始敏感词信息
word_info = node.word_info.copy()
found_words.append(word_info)
# 更新最高严重程度
severity = word_info.get("severity", 1)
if severity > highest_severity:
highest_severity = severity
break
return len(found_words) > 0, found_words
return {
"contains_sensitive_words": len(found_words) > 0,
"sensitive_words_found": found_words,
"highest_severity": highest_severity
}
# 创建全局敏感词过滤器实例
sensitive_word_filter = SensitiveWordFilter()
\ No newline at end of file
......@@ -56,42 +56,80 @@ async def init_db():
{
"word": "赌博",
"category": "违法活动",
"subcategory": "赌博",
"severity": 3,
"created_at": datetime.now(),
"updated_at": datetime.now()
},
{
"word": "色情",
"category": "违法活动",
"category": "色情内容",
"subcategory": "色情服务",
"severity": 4,
"created_at": datetime.now(),
"updated_at": datetime.now()
},
{
"word": "毒品",
"category": "违法活动",
"category": "毒品相关",
"subcategory": "毒品名称",
"severity": 5,
"created_at": datetime.now(),
"updated_at": datetime.now()
},
{
"word": "诈骗",
"category": "违法活动",
"category": "诈骗相关",
"subcategory": "网络诈骗",
"severity": 4,
"created_at": datetime.now(),
"updated_at": datetime.now()
},
{
"word": "暴力",
"category": "不良内容",
"category": "暴力内容",
"subcategory": "语言暴力",
"severity": 3,
"created_at": datetime.now(),
"updated_at": datetime.now()
},
{
"word": "自杀",
"category": "不良内容",
"subcategory": "自杀",
"severity": 5,
"created_at": datetime.now(),
"updated_at": datetime.now()
},
{
"word": "政治敏感",
"category": "政治内容",
"subcategory": "敏感事件",
"severity": 4,
"created_at": datetime.now(),
"updated_at": datetime.now()
},
{
"word": "种族歧视",
"category": "歧视言论",
"subcategory": "种族歧视",
"severity": 4,
"created_at": datetime.now(),
"updated_at": datetime.now()
},
{
"word": "性别歧视",
"category": "歧视言论",
"subcategory": "性别歧视",
"severity": 3,
"created_at": datetime.now(),
"updated_at": datetime.now()
},
{
"word": "恐怖主义",
"category": "暴力内容",
"subcategory": "恐怖主义",
"severity": 5,
"created_at": datetime.now(),
"updated_at": datetime.now()
}
......@@ -133,10 +171,39 @@ async def init_db():
# 创建敏感词记录集合并添加假数据
sensitive_records = [
{
"user_id": user_id,
"conversation_id": conversation_id,
"message_content": "我想了解一下赌博的方法",
"sensitive_words_found": ["赌博"],
"user_id": "user123",
"conversation_id": "conv123",
"message_content": "我想了解一下赌博的事情",
"sensitive_words_found": [
{
"word": "赌博",
"category": "违法活动",
"subcategory": "赌博",
"severity": 3
}
],
"highest_severity": 3,
"timestamp": datetime.now()
},
{
"user_id": "user123",
"conversation_id": "conv456",
"message_content": "如何获取毒品和色情内容",
"sensitive_words_found": [
{
"word": "毒品",
"category": "毒品相关",
"subcategory": "毒品名称",
"severity": 5
},
{
"word": "色情",
"category": "色情内容",
"subcategory": "色情服务",
"severity": 4
}
],
"highest_severity": 5,
"timestamp": datetime.now()
}
]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment