x
This commit is contained in:
commit
4af0f9c4a1
3
.idea/.gitignore
vendored
Normal file
3
.idea/.gitignore
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
# 默认忽略的文件
|
||||
/shelf/
|
||||
/workspace.xml
|
9
.idea/misc.xml
Normal file
9
.idea/misc.xml
Normal file
@ -0,0 +1,9 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="Black">
|
||||
<option name="sdkName" value="Python 3.12 (llama_factory_local)" />
|
||||
</component>
|
||||
<component name="ProjectRootManager" version="2" languageLevel="JDK_1_8" default="true" project-jdk-name="Python 3.12 (llama_factory_local)" project-jdk-type="Python SDK">
|
||||
<output url="file://$PROJECT_DIR$/out" />
|
||||
</component>
|
||||
</project>
|
8
.idea/modules.xml
Normal file
8
.idea/modules.xml
Normal file
@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/offline_data_model_pipline.iml" filepath="$PROJECT_DIR$/.idea/offline_data_model_pipline.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
15
.idea/offline_data_model_pipline.iml
Normal file
15
.idea/offline_data_model_pipline.iml
Normal file
@ -0,0 +1,15 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="JAVA_MODULE" version="4">
|
||||
<component name="FacetManager">
|
||||
<facet type="Python" name="Python">
|
||||
<configuration sdkName="Python 3.12 (llama_factory_local)" />
|
||||
</facet>
|
||||
</component>
|
||||
<component name="NewModuleRootManager" inherit-compiler-output="true">
|
||||
<exclude-output />
|
||||
<content url="file://$MODULE_DIR$" />
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
<orderEntry type="library" name="Python 3.12 (llama_factory_local) interpreter library" level="application" />
|
||||
</component>
|
||||
</module>
|
6
.idea/vcs.xml
Normal file
6
.idea/vcs.xml
Normal file
@ -0,0 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
154
data_generate/zw12345/baogao_content_extract.py
Normal file
154
data_generate/zw12345/baogao_content_extract.py
Normal file
@ -0,0 +1,154 @@
|
||||
"""
|
||||
政务12345全国数据生成系统
|
||||
功能:
|
||||
1. 支持全国范围地理位置生成
|
||||
2. 多层级分类扩展
|
||||
3. 数据保存至Excel
|
||||
4. 真实业务场景模拟
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
from openai import OpenAI
|
||||
import random
|
||||
import time
|
||||
import re
|
||||
from typing import List, Dict, Tuple
|
||||
|
||||
# Module-level OpenAI client used by every chat-completion request in this script.
# NOTE(review): "your-api-key" is a placeholder, not a working credential —
# replace it (or load the key from configuration) before running.
client = OpenAI(api_key="your-api-key")
|
||||
|
||||
class NationalDataGenerator:
    """Generate synthetic 12345 government-hotline work orders.

    On construction the generator reads the base categories from an Excel
    sheet, asks the chat model for a pool of locations and for sub-categories
    of every base category; ``generate_dataset`` then combines them into
    validated records. All model traffic goes through the module-level
    ``client`` object.
    """

    # Leading list markers such as "- ", "* ", "1. " or "2、".
    _BULLET_RE = re.compile(r"^\s*(?:[-*•·]+|\d+[.、))])\s*")
    # "关键词" followed by an ASCII or full-width colon, then the keyword list.
    _KEYWORD_RE = re.compile(r"关键词\s*[::]\s*(.*)")
    # A bare ordinal label such as "类型1" that carries no category name itself.
    _LABEL_RE = re.compile(r"^(?:细分)?(?:类型|类别)?\s*\d+$")

    def __init__(self, excel_path: str, category_column: str):
        """Load categories, then build the location pool and sub-category map.

        Args:
            excel_path: Path of the Excel workbook holding the base categories.
            category_column: Name of the column to read the categories from.
        """
        self.base_categories = self._load_excel_categories(excel_path, category_column)
        self.location_pool = self._generate_national_locations()
        self.expanded_categories = self._expand_categories_with_gpt()
        # Contents already accepted by generate_dataset (used for de-duplication).
        self.used_records = set()

    def _load_excel_categories(self, path: str, column: str) -> List[str]:
        """Return the distinct non-null values of ``column`` in the workbook."""
        df = pd.read_excel(path)
        return df[column].dropna().unique().tolist()

    @classmethod
    def _clean_list_lines(cls, text: str) -> List[str]:
        """Split a model reply into list items with their bullet markers removed.

        Blank lines are dropped. When at least one line carries a bullet or a
        number marker, only marked lines are kept so that introductory
        sentences do not end up in the result.
        """
        lines = [line.strip() for line in (text or "").splitlines() if line.strip()]
        bulleted = [line for line in lines if cls._BULLET_RE.match(line)]
        cleaned = (cls._BULLET_RE.sub("", line).strip() for line in (bulleted or lines))
        return [line for line in cleaned if line]

    @classmethod
    def _extract_sub_category(cls, line: str) -> str:
        """Return the sub-category name contained in one list line.

        "类型1:施工许可违规" yields "施工许可违规" (the part after an ordinal
        label); "施工扰民:夜间噪音" yields "施工扰民" (the name before a
        description); a line without a colon is returned unchanged.
        """
        item = cls._BULLET_RE.sub("", line).strip()
        parts = re.split(r"[::]", item, maxsplit=1)
        if len(parts) == 1:
            return item
        head, tail = parts[0].strip(), parts[1].strip()
        if tail and cls._LABEL_RE.match(head):
            return tail
        return head or tail

    def _generate_national_locations(self, num=200) -> List[str]:
        """Ask the model for ``num`` locations and return them as a list.

        Each reply line is expected to look like "- 广东省广州市天河区珠江新城";
        lines that do not follow that shape are tolerated instead of raising.
        """
        response = client.chat.completions.create(
            model="gpt-4-turbo",
            messages=[{
                "role": "user",
                "content": f"生成{num}个中国各城市真实存在的地理位置,按省市区三级格式,示例:\n- 广东省广州市天河区珠江新城\n- 浙江省杭州市余杭区未来科技城"
            }]
        )
        return self._clean_list_lines(response.choices[0].message.content)

    def _expand_categories_with_gpt(self) -> Dict[str, List[str]]:
        """Map every base category to the sub-categories proposed by the model.

        A category for which nothing usable could be parsed falls back to a
        single sub-category equal to the base category, so later random
        selection never sees an empty list.
        """
        category_map = {}
        for base_cat in self.base_categories:
            response = client.chat.completions.create(
                model="gpt-4-turbo",
                messages=[{
                    "role": "user",
                    "content": f"生成与【{base_cat}】相关但具有政务场景区分度的5个细分类型,示例:\n- 类型1:施工许可违规\n- 类型2:夜间施工超时"
                }]
            )
            lines = self._clean_list_lines(response.choices[0].message.content)
            sub_cats = [name for name in map(self._extract_sub_category, lines) if name]
            category_map[base_cat] = sub_cats or [base_cat]
            time.sleep(1)  # pause between consecutive API requests
        return category_map

    def generate_dataset(self, num_records: int, max_attempts=None, delay: float = 1.2) -> pd.DataFrame:
        """Generate up to ``num_records`` validated, de-duplicated records.

        Args:
            num_records: Number of records wanted.
            max_attempts: Upper bound on generation attempts; defaults to ten
                attempts per requested record so persistent failures cannot
                loop forever.
            delay: Seconds to wait after every attempt.

        Returns:
            A DataFrame with one row per accepted record; it holds fewer than
            ``num_records`` rows when the attempt budget is exhausted.
        """
        if max_attempts is None:
            max_attempts = max(num_records, 0) * 10

        data = []
        attempts = 0
        while len(data) < num_records and attempts < max_attempts:
            attempts += 1
            base_cat = random.choice(self.base_categories)
            sub_cat = random.choice(self.expanded_categories[base_cat])
            location = random.choice(self.location_pool)

            content, keywords = self._generate_content(base_cat, sub_cat, location)
            if content and self._validate_record(content, keywords, base_cat):
                # Remember the text so _validate_record rejects a repeat.
                self.used_records.add(content)
                data.append({
                    "ID": len(data) + 1,
                    "内容": content,
                    "关键词": " ".join(keywords),
                    "参考答案": base_cat,
                    "细分类型": sub_cat,
                    "地理位置": location
                })
            if delay > 0:
                time.sleep(delay)

        return pd.DataFrame(data)

    def _generate_content(self, base_cat: str, sub_cat: str, location: str) -> Tuple[str, List[str]]:
        """Request one work order from the model.

        Returns:
            ``(content, keywords)`` on success, or ``(None, [])`` when the
            request or the parsing fails (the error is printed).
        """
        prompt = f"""生成真实可信的12345政务工单,要求:
1. 主分类:【{base_cat}】
2. 细分类型:【{sub_cat}】
3. 发生地点:【{location}】
4. 包含要素:时间、具体问题、影响范围、市民诉求
5. 生成5个关键词(必须包含{base_cat})
6. 内容长度80-150字

示例格式:
市民反映{location}某建筑工地违规夜间施工至凌晨,噪音严重干扰周边居民。已向环保部门投诉3次未解决,要求立即停工整顿。
关键词:夜间施工 噪音污染 环保投诉 施工许可 居民维权"""

        try:
            response = client.chat.completions.create(
                model="gpt-4-turbo",
                messages=[
                    {"role": "system", "content": "你是政务数据生成专家"},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.7,
                max_tokens=600
            )
            raw_text = (response.choices[0].message.content or "").strip()
            return self._parse_generated_text(raw_text)
        except Exception as e:  # best effort: one failed request must not abort the run
            print(f"生成失败:{str(e)}")
            return None, []

    def _parse_generated_text(self, text: str) -> Tuple[str, List[str]]:
        """Split a model reply into the work-order text and its keywords.

        The keyword line may use an ASCII or full-width colon and may separate
        keywords with whitespace or commas. At most five keywords are
        returned; a reply without a keyword line yields an empty list instead
        of raising.
        """
        text = text or ""
        match = self._KEYWORD_RE.search(text)
        if match is None:
            return text.strip(), []
        # Drop only the keyword line; any surrounding text stays in the content.
        content = (text[:match.start()] + text[match.end():]).strip()
        keywords = [k for k in re.split(r"[\s,,、;;]+", match.group(1)) if k][:5]
        return content, keywords

    def _validate_record(self, content: str, keywords: List[str], category: str) -> bool:
        """Return True when a record passes all five acceptance checks.

        Checks: at least 80 characters, exactly five keywords, the base
        category among the keywords, text not accepted before, and at least
        one digit in the text.
        """
        if not content:
            return False
        return (
            len(content) >= 80 and
            len(keywords) == 5 and
            category in keywords and
            content not in self.used_records and
            any(c.isdigit() for c in content)
        )
|
||||
|
||||
|
||||
# 输入文件示例(input.xlsx)
|
||||
"""
|
||||
| 基础分类 |
|
||||
|--------------|
|
||||
| 施工管理 |
|
||||
| 消费维权 |
|
||||
| 城市管理 |
|
||||
| 公共服务 |
|
||||
"""
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Build the generator from the sample workbook and its category column.
    data_generator = NationalDataGenerator(excel_path="input.xlsx", category_column="基础分类")

    # Produce 100 records.
    dataset = data_generator.generate_dataset(100)

    # Persist the records to an Excel workbook.
    with pd.ExcelWriter("government_12345_data.xlsx") as excel_writer:
        dataset.to_excel(excel_writer, index=False)

    # Show the first three rows of the most relevant columns.
    preview_columns = ["ID", "内容", "关键词", "参考答案"]
    print("生成数据示例:")
    print(dataset[preview_columns].head(3).to_string(index=False))
|
394
data_generate/zw12345/dianhuadialog_generate_demo.py
Normal file
394
data_generate/zw12345/dianhuadialog_generate_demo.py
Normal file
@ -0,0 +1,394 @@
|
||||
import os
|
||||
import json
|
||||
import random
|
||||
from typing import List, Dict, Tuple
|
||||
from openai import OpenAI
|
||||
from faker import Faker
|
||||
|
||||
class FullyDynamicGenerator:
    """Generate a simulated hotline phone dialog between a citizen and an agent.

    A dialog is assembled from five phases (opening, basic information
    collection, verification, extended questions, final confirmation). Each
    phase is a list of ``(speaker, turn_type, content)`` tuples; text comes
    from the chat model when available and from built-in fallbacks otherwise.
    """

    # Communication traits keyed by the matching entry of ``special_cases``.
    _CASE_TRAITS = {
        "方言沟通": ["带浓重口音", "夹杂方言词汇", "语法不规范"],
        "老年人口齿不清": ["说话缓慢", "重复语句", "耳背听不清"],
        "情绪激动打断对话": ["不断打断", "提高音量", "带哭腔"],
    }
    # Errors raised when a model reply is empty, not JSON, or not a JSON object.
    _JSON_ERRORS = (TypeError, ValueError, AttributeError)

    def __init__(self):
        """Create the API client (key read from ``OPENAI_API_KEY``) and helpers."""
        self.llm = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        self.faker = Faker('zh_CN')
        # Scene knowledge of generated dialogs, keyed by "category_subcategory".
        self.dynamic_memory = {}
        self.special_cases = [
            "方言沟通", "老年人口齿不清", "情绪激动打断对话",
            "背景噪音干扰", "信号断续"
        ]

    def generate_dialog(self, category: str, subcategory: str) -> List[Dict]:
        """Generate one complete dialog for the given category/subcategory.

        Returns:
            A list of dicts with the keys ``turn``, ``speaker``, ``type`` and
            ``content``, in speaking order.
        """
        scene_knowledge = self._generate_scene_knowledge(category, subcategory)
        self.dynamic_memory[f"{category}_{subcategory}"] = scene_knowledge
        dialog = []
        dialog.extend(self._generate_complex_opening(category, subcategory))
        dialog.extend(self._generate_obstacle_base_phase(scene_knowledge, subcategory))
        dialog.extend(self._generate_verification_with_challenges(dialog))
        dialog.extend(self._generate_technical_extend_phase(scene_knowledge, subcategory))
        dialog.extend(self._generate_final_confirmation(scene_knowledge, subcategory))
        return self._format_output(dialog)

    def _generate_complex_opening(self, category: str, subcategory: str) -> List[Tuple]:
        """Generate the opening exchange, optionally with a special call situation.

        A special case is drawn from ``special_cases`` (with three extra
        chances of "none"). Both model calls use JSON mode, so the prompts
        name the JSON format and the ``text`` key that is read back.
        """
        phase = []
        special_case = random.choice(self.special_cases + [None] * 3)

        trait_options = self._CASE_TRAITS.get(special_case)
        # Cases without a trait list are described by their own name.
        trait = random.choice(trait_options) if trait_options else special_case
        opening_trait = trait or "正常沟通"
        service_trait = trait or "标准服务"

        opening_prompt = f"""生成市民反映{subcategory}问题的电话开场白,要求:
1. 必须包含"您好"等礼貌用语
2. 体现真实通话特征:{opening_trait}
3. 包含具体问题细节
请以JSON格式返回:{{"text": "开场白内容"}}"""
        opening = self._safe_llm_call(
            prompt=opening_prompt,
            system="你擅长模拟各类人群的真实对话",
            response_format={"type": "json_object"}
        )
        try:
            opening_data = json.loads(opening)
            opening_text = opening_data.get("text", f"您好,我要反映{subcategory}问题")
            if special_case == "方言沟通":
                opening_text = self._add_dialect_features(opening_text)
        except self._JSON_ERRORS:
            opening_text = f"您好,我想投诉{subcategory}问题"
        phase.append(("市民", "open_call", opening_text))

        response_prompt = f"""根据市民来电特征:{special_case if special_case else '正常'},生成专业应答:
1. 包含工号和服务承诺
2. 适应沟通特征:{service_trait}
请以JSON格式返回:{{"text": "应答内容"}}"""
        response = self._safe_llm_call(
            prompt=response_prompt,
            system="你是适应力强的专业客服",
            response_format={"type": "json_object"}
        )
        try:
            response_data = json.loads(response)
            response_text = response_data.get("text", f"感谢来电,工号{random.randint(1000,1999)}为您服务")
            if special_case == "老年人口齿不清":
                response_text += "(放慢语速)请您慢慢说"
        except self._JSON_ERRORS:
            response_text = "您好,政务热线为您服务"
        phase.append(("客服", "agent_response", response_text))

        if special_case in ["方言沟通", "老年人口齿不清", "信号断续"]:
            phase.append(("客服", "double_check", f"抱歉,刚才没有听清楚,您是说{subcategory}问题对吗?"))
            phase.append(("市民", "clarify", random.choice([
                "对,就是这个问题",
                f"不是,是{random.choice(['更严重','其他'])}的问题",
                "(声音断断续续)喂...听得到吗?"
            ])))
        return phase

    def _generate_obstacle_base_phase(self, knowledge: Dict, scene: str) -> List[Tuple]:
        """Collect the five basic fields, with occasional communication obstacles.

        For each field the agent asks (10% of the time an unclear question is
        asked first and then retried), the citizen answers, and a vague answer
        triggers a clarification round that may include a refusal.
        """
        phase = []
        required_fields = ["时间", "地点", "事件描述", "联系方式", "姓氏"]
        for field in required_fields:
            if random.random() < 0.1:
                unclear_question = self._safe_llm_call(
                    prompt=f"生成有歧义的{field}询问话术",
                    system="故意制造1-2处不明确表述"
                ) or f"那个...关于{field}的情况能不能说下?"
                phase.append(("客服", "unclear_question", unclear_question))
                phase.append(("市民", "confused", "您问的是什么?我没听明白"))
                question = self._safe_llm_call(
                    prompt=f"重新生成清晰的{field}询问话术",
                    system="使用最简明的表达"
                ) or f"请提供{field}的具体信息"
                phase.append(("客服", "retry_question", question))
            else:
                question = self._safe_llm_call(
                    prompt=f"生成政务热线询问{field}的标准话术,场景:{scene}",
                    system="要求:1.使用敬语 2.明确信息要求"
                ) or f"请问{scene}的{field}是?"
                phase.append(("客服", "info_request", question))

            answer, needs_clarify = self._generate_complex_answer(scene, field)
            phase.append(("市民", "info_response", answer))
            if needs_clarify:
                clarify_question = self._safe_llm_call(
                    prompt=f"根据模糊回答'{answer}'生成澄清{field}的追问",
                    system="要求:1.指出不明确处 2.提供填写范例"
                ) or f"您提供的{field}不够具体,请补充(例:{self._get_field_example(field)})"
                phase.append(("客服", "clarify_request", clarify_question))
                if random.random() < 0.1:
                    phase.append(("市民", "refuse", random.choice([
                        "这么麻烦不说了!",
                        "你们政府办事就是繁琐",
                        f"{field}有什么好问的!"
                    ])))
                    phase.append(("客服", "calm_down", random.choice([
                        "理解您的心情,但详细信息能帮助我们更快解决问题",
                        "抱歉给您带来不便,这是必要流程"
                    ])))
                phase.append(("市民", "clarified_response", f"哦,应该是{self._get_field_example(field)}"))
        return phase

    def _generate_complex_answer(self, scene: str, field: str) -> Tuple[str, bool]:
        """Return a citizen answer for ``field`` and whether it needs clarification.

        15% of the answers come from a pool of difficult replies (hesitant,
        noisy or hostile); the rest are ordinary answers built with Faker.
        """
        if random.random() < 0.15:
            special_answers = {
                "时间": [
                    ("就...就那个...前几天", True),
                    ("(背景嘈杂)喂?时间啊...上周?", True),
                    ("我不记得了!你们自己查!", False)
                ],
                "地点": [
                    ("俺们村东头那个...那个啥来着", True),
                    ("(信号不好)在...哗哗...超市附近", True),
                    ("这么简单的问题都处理不了?", False)
                ]
            }
            return random.choice(special_answers.get(field, [("这个我说不好", True)]))

        answers = {
            "时间": [
                (f"{random.choice(['今天','昨天'])}{random.randint(1,12)}点左右", False),
                (f"持续{random.randint(2,24)}小时了", False)
            ],
            "地点": [
                (f"{self.faker.building_number()}号{random.choice(['东侧','南门'])}", False),
                (f"{self.faker.street_name()}附近", True)
            ],
            "联系方式": [
                (f"{self.faker.phone_number()[:3]}****", True),
                (f"固话:{self.faker.phone_number()[:4]}-{self.faker.phone_number()[-4:]}", False)
            ],
            "姓氏": [
                (f"免贵姓{self.faker.last_name()}", False),
                ("叫我老李就行", True)
            ]
        }
        return random.choice(answers.get(field, [("具体情况是这样的...", False)]))

    def _generate_verification_with_challenges(self, previous_dialog: List[Tuple]) -> List[Tuple]:
        """Generate the read-back of the collected time/place/surname.

        An answer is attributed to a field when the answer text names the
        field, or when the agent question that preceded it names exactly one
        tracked field. Returns an empty list when nothing was collected.
        Collected values are occasionally replaced by wrong ones, and the
        citizen corrects the read-back 30% of the time.
        """
        phase = []
        tracked_fields = ["时间", "地点", "姓氏"]
        request_types = {"info_request", "retry_question", "clarify_request", "unclear_question"}
        answer_types = {"info_response", "clarified_response"}

        collected_info = {}
        asked_field = None
        for turn in previous_dialog:
            turn_type, content = turn[1], str(turn[2])
            if turn_type in request_types:
                mentioned = [f for f in tracked_fields if f in content]
                # Only an unambiguous question tells us which field comes next.
                asked_field = mentioned[0] if len(mentioned) == 1 else None
                continue
            if turn_type not in answer_types:
                continue
            matched = [f for f in tracked_fields if f in content]
            if not matched and asked_field:
                matched = [asked_field]
            for field in matched:
                collected_info[field] = turn[2]
                if random.random() < 0.1:
                    collected_info[field] = self._get_wrong_info(field)

        if not collected_info:
            return phase

        if random.random() < 0.05:
            wrong_field = random.choice(list(collected_info.keys()))
            collected_info[wrong_field] = self._get_wrong_info(wrong_field)

        verification_text = self._safe_llm_call(
            prompt="根据以下信息生成确认话术:" + json.dumps(collected_info, ensure_ascii=False),
            system="要求:1.逐项确认 2.允许修正"
        ) or f"我确认下:时间:{collected_info.get('时间','')},地点:{collected_info.get('地点','')}..."
        phase.append(("客服", "info_verification", verification_text))

        if random.random() < 0.3:
            correction_field = random.choice(list(collected_info.keys()))
            phase.append(("市民", "correction",
                          f"{correction_field}不对!应该是{self._get_field_example(correction_field)}"))
            if random.random() < 0.1:
                phase.append(("市民", "angry", "你们连基本信息都记错!"))
                phase.append(("客服", "apology", "非常抱歉,这是我们的失误"))
            phase.append(("客服", "acknowledge_correction", f"已更正{correction_field}信息"))
            phase.append(("市民", "final_confirmation", "现在对了"))
        else:
            phase.append(("市民", "confirmation", "对,没错"))
        return phase

    def _generate_technical_extend_phase(self, knowledge: Dict, scene: str) -> List[Tuple]:
        """Ask the scene-specific follow-up questions listed in ``knowledge``.

        5% of the questions are first asked in jargon and then simplified; the
        citizen answers the last question asked. Entries of
        ``extend_questions`` that are not dicts are skipped.
        """
        phase = []
        for question_config in knowledge.get("extend_questions", []):
            if not isinstance(question_config, dict):
                continue
            if random.random() < 0.05:
                tech_question = self._safe_llm_call(
                    prompt=f"生成包含专业术语的{scene}问题",
                    system="使用3个以上专业词汇"
                ) or f"请问{scene}的{random.choice(['频谱特征','声压级衰减曲线'])}是怎样的?"
                phase.append(("客服", "technical_question", tech_question))
                phase.append(("市民", "not_understand", "这些专业名词听不懂"))
                simplified = self._safe_llm_call(
                    prompt=f"将'{tech_question}'转化为通俗问题",
                    system="用生活化比喻解释"
                ) or f"就是问{scene}的具体表现是怎样的"
                phase.append(("客服", "simplified_question", simplified))
                # The citizen answers the simplified wording.
                question = simplified
            else:
                question = self._safe_llm_call(
                    prompt=f"基于{scene}场景生成追问:{question_config.get('prompt','')}",
                    system="要求:1.分步骤询问 2.适度专业"
                ) or question_config.get('prompt', '')
                phase.append(("客服", "extend_question", question))

            if random.random() < 0.15:
                phase.append(("市民", "broken_response", "喂?...听得到吗?...我说到哪了?"))
                phase.append(("客服", "reassure", "电话不太稳定,请您继续"))

            answer = self._generate_realistic_answer(
                question, scene, question_config.get("theme", ""), "extend"
            )
            phase.append(("市民", "extend_answer", answer))

            if random.random() < 0.1:
                phase.append(("客服", "request_material", "需要您提供现场照片或录音证据"))
                phase.append(("市民", "material_response", random.choice([
                    "我手机里有,怎么发给你们?",
                    "现在拍不了,你们自己来看!"
                ])))
                phase.append(("客服", "guide", "可以通过微信公众号'市民服务'上传"))
        return phase

    def _generate_final_confirmation(self, knowledge: Dict, scene: str) -> List[Tuple]:
        """Generate the closing statement, with a follow-up question 20% of the time."""
        phase = []
        confirmation = self._safe_llm_call(
            prompt=f"生成{scene}问题的最终确认话术",
            system="包含:1.处理时限 2.反馈方式 3.应急联系人"
        ) or f"我们将在{random.choice(['24小时','3个工作日'])}内处理您的{scene}问题"
        phase.append(("客服", "final_confirmation", confirmation))
        if random.random() < 0.2:
            phase.append(("市民", "follow_up", random.choice([
                "如果超时没处理怎么办?",
                "我要找哪个部门跟进?"
            ])))
            phase.append(("客服", "replay", random.choice([
                "可拨打监督电话12345查询进度",
                "我们会主动给您回复"
            ])))
        return phase

    def _generate_scene_knowledge(self, category: str, subcategory: str) -> Dict:
        """Ask the model for the scene configuration as a JSON object.

        Returns the parsed object extended with ``confirmation_template``, or
        the built-in fallback knowledge when the reply is empty, is not valid
        JSON, or is not a JSON object.
        """
        prompt = f"""作为政务热线专家,请为【{category}->{subcategory}】场景生成知识配置,包含:
1. 3-5个必问基础字段(如时间、地点)
2. 3个专业追问方向及追问话术模板
3. 该场景涉及的相关部门和处理时限参考
返回JSON格式,结构示例:
{{
"base_fields": [
{{"field": "时间", "prompt": "询问具体时间的标准话术"}},
{{"field": "地点", "prompt": "询问详细位置的专业话术"}}
],
"extend_questions": [
{{"theme": "历史记录", "prompt": "追问历史投诉情况的专业话术"}},
{{"theme": "紧急程度", "prompt": "评估问题紧急程度的询问方式"}}
],
"departments": ["城管局", "环保局"],
"time_ranges": ["24小时内", "3个工作日"]
}}"""
        response = self._safe_llm_call(
            prompt=prompt,
            system="你是有10年经验的政务热线系统架构师",
            response_format={"type": "json_object"}
        )
        try:
            knowledge = json.loads(response)
        except self._JSON_ERRORS:
            return self._get_fallback_knowledge(category, subcategory)
        if not isinstance(knowledge, dict):
            return self._get_fallback_knowledge(category, subcategory)
        knowledge["confirmation_template"] = self._generate_confirmation_template(
            category, subcategory, knowledge.get("departments", []), knowledge.get("time_ranges", [])
        )
        return knowledge

    def _generate_confirmation_template(self, category: str, subcategory: str,
                                        departments: List[str], time_ranges: List[str]) -> str:
        """Ask the model for a parameterised confirmation sentence template."""
        prompt = f"""为【{category}->{subcategory}】创建确认话术模板,要求包含:
1. 处理部门:{departments}
2. 预计时限:{time_ranges}
3. 至少2种后续跟进方式
模板示例:"我们将协调{{department}}在{{timeframe}}内处理,可通过{{phone}}或{{wechat}}查询进展"
"""
        return self._safe_llm_call(
            prompt=prompt,
            system="你需创建可参数化的文本模板,用{}标记变量位置"
        ) or f"我们将尽快处理您的{subcategory}问题"

    def _generate_realistic_answer(self, question: str, scene: str,
                                   field: str, answer_type: str) -> str:
        """Ask the model for a citizen answer to ``question``.

        ``answer_type`` selects the persona ("base" or "extend"); an unknown
        value uses the "base" persona. Falls back to the static field example
        when the model returns nothing.
        """
        prompt = f"""模拟市民对【{scene}】问题中'{question}'的真实回答,要求:
1. 包含具体{field}的细节数据
2. 反映真实诉求和情绪梯度
3. 使用该场景典型市民的语言特征"""
        personas = {
            "base": "你是一个普通市民,回答要口语化并带生活细节",
            "extend": "你是有相关专业知识的市民,回答要包含技术参数和量化描述"
        }
        system = personas.get(answer_type, personas["base"])
        answer = self._safe_llm_call(prompt=prompt, system=system)
        return answer or self._get_field_example(field)

    def _get_field_example(self, field: str) -> str:
        """Return a static example value for ``field`` (generic text if unknown)."""
        examples = {
            "时间": "2023年10月15日下午3点20分",
            "地点": "朝阳区建国路88号地下二层停车场",
            "联系方式": "13800138000或010-12345678",
            "姓氏": "张先生/李女士"
        }
        return examples.get(field, "具体情况是这样的...")

    def _get_fallback_knowledge(self, category: str, subcategory: str) -> Dict:
        """Return the built-in scene knowledge used when the model reply is unusable."""
        return {
            "base_fields": [
                {"field": "时间", "prompt": f"请问{subcategory}发生的具体时间?"},
                {"field": "地点", "prompt": f"请说明{category}问题的详细位置?"}
            ],
            "extend_questions": [
                {"theme": "基本情况", "prompt": f"请描述{subcategory}的具体表现?"}
            ],
            "confirmation_template": f"我们将处理您的{category}问题",
            "departments": ["相关部门"],
            "time_ranges": ["尽快"]
        }

    def _add_dialect_features(self, text: str) -> str:
        """Apply the first matching word substitution of a random dialect group.

        When no substitution applies, a dialect tag question is appended.
        """
        dialects = {
            "北方方言": [("我", "俺"), ("的", "滴"), ("这个", "这玩意儿")],
            "南方方言": [("是不是", "系唔系"), ("不知道", "母鸡"), ("说", "讲")]
        }
        replacements = random.choice(list(dialects.values()))
        for orig, rep in replacements:
            if orig in text:
                return text.replace(orig, rep)
        return text + random.choice(["晓得伐?", "中不中?", "得啵?"])

    def _get_wrong_info(self, field) -> str:
        """Return a deliberately vague or wrong value for ``field``."""
        wrong_examples = {
            "时间": ["昨天", "上周", "记不清了"],
            "地点": ["东边", "路口", "大概位置"],
            "姓氏": ["王", "李", "张"]
        }
        if field in wrong_examples:
            return random.choice(wrong_examples[field])
        return "信息有误"

    def _safe_llm_call(self, prompt: str, system: str = None, **kwargs) -> str:
        """Call the chat model and return its text, or "" on any failure.

        Extra keyword arguments are forwarded to ``chat.completions.create``.
        Errors are printed and swallowed so callers can use ``or`` fallbacks.
        """
        try:
            messages = [{"role": "user", "content": prompt}]
            if system:
                messages.insert(0, {"role": "system", "content": system})
            response = self.llm.chat.completions.create(
                model="gpt-4-turbo",
                messages=messages,
                temperature=0.7,
                max_tokens=400,
                **kwargs
            )
            return response.choices[0].message.content or ""
        except Exception as e:  # best effort by design: callers rely on the "" fallback
            print(f"API异常: {str(e)}")
            return ""

    def _format_output(self, dialog: List[Tuple]) -> List[Dict]:
        """Convert ``(speaker, type, content)`` tuples into numbered dicts."""
        return [{
            "turn": idx + 1,
            "speaker": speaker,
            "type": dtype,
            "content": content
        } for idx, (speaker, dtype, content) in enumerate(dialog)]
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Keep a real key that is already exported; only fall back to the
    # placeholder so the demo still constructs its client.
    os.environ.setdefault("OPENAI_API_KEY", "your-api-key")
    generator = FullyDynamicGenerator()
    dialog = generator.generate_dialog("城乡建设", "施工噪音")
    print("\n=== 政务热线完整对话 ===")
    for turn in dialog:
        print(f"{turn['turn']}. [{turn['speaker']}][{turn['type']}] {turn['content']}")
|
206
data_generate/zw12345/yanpanbaogaozongjie_demo.py
Normal file
206
data_generate/zw12345/yanpanbaogaozongjie_demo.py
Normal file
@ -0,0 +1,206 @@
|
||||
from openai import OpenAI, APIError, RateLimitError, AuthenticationError
|
||||
import csv
|
||||
import json
|
||||
import re
|
||||
import ast
|
||||
from typing import List, Dict, Tuple
|
||||
|
||||
|
||||
class GovDataGenerator:
    """Generate keyword lists for government-service (12345) domains.

    For each domain title the generator first asks the model for a small
    configuration (categories plus one special requirement), then asks for
    example cases and extracts, filters and ranks their keywords.
    """

    def __init__(self, api_key: str, base_url: str = "https://api.deepseek.com"):
        """Create the API client.

        Args:
            api_key: Credential for the endpoint. It is always supplied by the
                caller; no key is embedded in the source.
            base_url: Base URL of the OpenAI-compatible endpoint.
        """
        self.client = OpenAI(api_key=api_key, base_url=base_url)
        # title -> (categories, requirements) for configurations already fetched
        self.config_cache = {}
        self._init_prompts()

    def _init_prompts(self):
        """Initialize all prompt templates."""
        self.base_prompt = """请生成与【{title}】相关的5个真实政务12345业务案例,每个案例包含:
1. 当事人(如张先生)+ 问题场景(如XX街道XX小区)
2. 业务类型(如社保缴纳)+ 具体问题(如未回复)
3. 涉及单位(如医保局)+ 时间要素(如8月17日)
4. 证件编号(如370181XXXXXXXXXX)+ 政策条件(如连续缴费12个月)

每个案例生成6-8个关键词,格式:
[案例1] 关键词:关键词1,关键词2,...
[案例2] 关键词:关键词A,关键词B,...
请用中文逗号分隔,不要编号"""

        self.config_prompt = """请根据政务领域【{title}】生成:
1. 3-5个核心业务分类(categories)
2. 1条特别生成要求(requirements)

示例(保险领域):
categories: ["医疗保险", "失业保险", "养老保险", "生育保险"]
requirements: "需包含医保报销和生育津贴案例各1个"

请用JSON格式返回:{{"categories": [], "requirements": ""}}"""

    def _call_gpt(self, prompt: str, **kwargs) -> str:
        """Send ``prompt`` through the chat-completions API and return the reply.

        Keyword Args:
            temperature: Sampling temperature (default 0.7).
            max_tokens: Completion token limit (default 800).

        Raises:
            RuntimeError: On any request failure; the original exception is
                attached as ``__cause__``.
        """
        try:
            response = self.client.chat.completions.create(
                model="deepseek-chat",
                temperature=kwargs.get('temperature', 0.7),
                max_tokens=kwargs.get('max_tokens', 800),
                messages=[
                    {"role": "system", "content": "You are a helpful assistant"},
                    {"role": "user", "content": prompt},
                ],
                stream=False
            )
            return response.choices[0].message.content or ""
        except RateLimitError as e:
            raise RuntimeError("请求超频,请等待后重试(错误码429)") from e
        except AuthenticationError as e:
            raise RuntimeError("API密钥无效,请检查密钥是否正确") from e
        except APIError as e:
            raise RuntimeError(f"API错误: {e.code} - {e.message}") from e
        except Exception as e:
            raise RuntimeError(f"请求失败: {str(e)}") from e

    def generate_dynamic_config(self, title: str) -> Tuple[List[str], str]:
        """Return ``(categories, requirements)`` for ``title``.

        Successful results are cached per title. A missing key in the model
        reply defaults to an empty list / empty string. On any failure the
        error is printed and ``([], "")`` is returned without caching.
        """
        if title in self.config_cache:
            return self.config_cache[title]

        try:
            prompt = self.config_prompt.format(title=title)
            raw_text = self._call_gpt(prompt, temperature=0.5, max_tokens=300)
            parsed = self.safe_parse_config(raw_text)

            categories = parsed.get("categories", [])
            requirements = parsed.get("requirements", "")
            if not isinstance(categories, list) or not isinstance(requirements, str):
                raise ValueError("配置格式错误")

            # Categories are joined into prompts later, so they must be strings.
            config = ([str(c) for c in categories], requirements)
            self.config_cache[title] = config
            return config

        except Exception as e:
            print(f"配置生成失败: {str(e)},使用默认配置")
            return [], ""

    def safe_parse_config(self, text: str) -> Dict:
        """Parse the configuration reply as leniently as possible.

        Tries, in order: strict JSON on the outermost ``{...}`` span, a Python
        literal on the same span (single quotes, trailing commas), and finally
        a regex salvage of a ``categories`` list and a ``requirements`` line.
        Always returns a dict; when nothing is recognised both values are empty.
        """
        text = text or ""
        brace_span = re.search(r'\{.*\}', text, re.DOTALL)
        if brace_span is not None:
            json_str = brace_span.group()
            try:
                parsed = json.loads(json_str)
            except json.JSONDecodeError:
                parsed = None
            if isinstance(parsed, dict):
                return parsed
            try:
                parsed = ast.literal_eval(json_str)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                parsed = None
            if isinstance(parsed, dict):
                return parsed

        # Last resort: pick the two fields out of the raw text.
        categories = []
        list_match = re.search(r'categories["\']?\s*[::]\s*\[(.*?)\]', text, re.DOTALL)
        if list_match:
            for item in re.findall(r'["\']([^"\']+)["\']', list_match.group(1)):
                if item not in categories:
                    categories.append(item)
        req_match = re.search(r'requirements["\']?\s*[::]\s*(.*)', text)
        requirements = ""
        if req_match:
            requirements = req_match.group(1).strip().rstrip('},').strip().strip('"\'')
        return {"categories": categories[:5], "requirements": requirements}

    def get_prompt(self, title: str) -> str:
        """Build the case-generation prompt, extended by the dynamic configuration."""
        categories, requirements = self.generate_dynamic_config(title)
        prompt = self.base_prompt.format(title=title)

        if requirements:
            prompt += f"\n特别要求:{requirements}"
        if categories:
            prompt += f"\n参考分类:{', '.join(categories[:3])}..."

        return prompt

    def generate_keywords(self, title: str) -> List[str]:
        """Run the full pipeline for ``title``; returns ``[]`` on any failure."""
        try:
            categories, _ = self.generate_dynamic_config(title)
            prompt = self.get_prompt(title)
            raw_text = self._call_gpt(prompt)
            return self.process_response(raw_text, categories)
        except Exception as e:
            print(f"关键词生成失败: {str(e)}")
            return []

    def process_response(self, raw_text: str, categories: List[str]) -> List[str]:
        """Extract, order and filter the keywords contained in a model reply.

        Keyword lines may use an ASCII or full-width colon and may separate
        items with ASCII commas, Chinese commas or "、". Keywords matching one
        of the first three categories come first (up to five per category),
        followed by the rest in reply order with duplicates removed. At most
        50 keywords are returned after ``post_process``.
        """
        cases = re.findall(r'关键词[::](.*?)(?=\n\[案例|\n\n|$)', raw_text or "", re.DOTALL)
        keywords = []
        for case in cases:
            keywords.extend(k.strip() for k in re.split(r'[,,、]', case) if k.strip())

        sampled = []
        if categories:
            for cat in categories[:3]:
                sampled.extend([k for k in keywords if cat in k][:5])

        seen = set()
        final = []
        for k in sampled + keywords:
            if k not in seen:
                seen.add(k)
                final.append(k)

        return self.post_process(final[:60])[:50]

    def post_process(self, keywords: List[str]) -> List[str]:
        """Normalise, filter and rank keywords.

        Whitespace runs are collapsed; entries shorter than four characters or
        containing "..." are dropped. Only entries containing a digit or one
        of the characters 市/区/街/道 are kept, and entries containing a
        priority term are moved to the front (the sort is stable).

        NOTE(review): entries without a digit or location character are
        discarded outright — confirm this filter is intended rather than a
        missing "else" branch.
        """
        processed = []
        for k in keywords:
            k = re.sub(r'\s+', ' ', k).strip()

            if len(k) < 4 or '...' in k:
                continue

            if any(c.isdigit() for c in k) or re.search(r'[市区街道]', k):
                processed.append(k)

        priority_terms = ['社保', '医保', '房产证', '施工许可']
        return sorted(processed,
                      key=lambda x: any(t in x for t in priority_terms),
                      reverse=True)

    def save_to_tsv(self, data: List[dict], filename: str):
        """Write one ``title`` / JSON-encoded ``keywords`` row per item to a TSV file."""
        with open(filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f, delimiter='\t')
            writer.writerow(['title', 'contents'])
            for item in data:
                writer.writerow([
                    item['title'],
                    json.dumps(item['keywords'], ensure_ascii=False)
                ])
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import os

    # Usage example: the key is read from the environment so that no
    # credential has to be written into the source; the placeholder is only
    # used when the variable is not set.
    api_key = os.environ.get("DEEPSEEK_API_KEY", "sk-your-api-key-here")
    generator = GovDataGenerator(api_key)

    # Generate the keywords for every domain.
    results = []
    for domain in ["保险", "城市管理", "房产"]:
        print(f"正在处理:{domain}")
        keywords = generator.generate_keywords(domain)
        results.append({
            "title": domain,
            "keywords": keywords
        })
        print(f"生成完成,获得{len(keywords)}个关键词")

    # Save the results.
    generator.save_to_tsv(results, "government_data.tsv")
    print("数据已保存至 government_data.tsv")
|
Loading…
Reference in New Issue
Block a user