154 lines
5.7 KiB
Python
154 lines
5.7 KiB
Python
|
"""
|
|||
|
政务12345全国数据生成系统
|
|||
|
功能:
|
|||
|
1. 支持全国范围地理位置生成
|
|||
|
2. 多层级分类扩展
|
|||
|
3. 数据保存至Excel
|
|||
|
4. 真实业务场景模拟
|
|||
|
"""
|
|||
|
|
|||
|
import pandas as pd
|
|||
|
from openai import OpenAI
|
|||
|
import random
|
|||
|
import time
|
|||
|
import re
|
|||
|
from typing import List, Dict, Tuple
|
|||
|
|
|||
|
# Module-level OpenAI client used by every chat-completion request in this file.
# NOTE(review): the API key below is a hard-coded placeholder -- supply a real
# key (preferably from configuration, not source) before running.
client = OpenAI(api_key="your-api-key")
|
|||
|
|
|||
|
class NationalDataGenerator:
    """Generate synthetic 12345 government-hotline work orders.

    Construction loads the base categories from an Excel sheet and then uses
    the module-level ``client`` to request (a) a pool of locations and (b) a
    list of sub-categories for every base category. ``generate_dataset``
    samples from those pools, asks the model for a work order, and keeps only
    the records that pass ``_validate_record``.
    """

    # Chat model used for every completion request.
    MODEL = "gpt-4-turbo"
    # Pauses (seconds) between consecutive API requests.
    CATEGORY_REQUEST_INTERVAL = 1
    RECORD_REQUEST_INTERVAL = 1.2

    # Leading list marker: "-", "*", "•", "·" or an ordinal such as "1." / "1、" / "1)".
    _BULLET_RE = re.compile(r"^\s*(?:[-*•·]|\d+\s*[.、)）])\s*")
    # Placeholder label used by the prompt example ("类型1", "类型2", ...).
    _LABEL_RE = re.compile(r"^类型\s*\d+$")
    # ASCII or full-width colon.
    _COLON_RE = re.compile(r"[:：]")
    # Keyword line; the capture is the remainder of that line.
    _KEYWORD_LINE_RE = re.compile(r"关键词[:：][ \t]*(.*)")
    # Whitespace plus the list separators models commonly emit.
    _KEYWORD_SEP_RE = re.compile(r"[\s,，、;；]+")

    def __init__(self, excel_path: str, category_column: str):
        """Load categories and build the location / sub-category pools.

        Args:
            excel_path: Path of the Excel workbook holding the base categories.
            category_column: Name of the column to read the categories from.
        """
        # Contents already emitted by generate_dataset; used for de-duplication.
        self.used_records = set()
        self.base_categories = self._load_excel_categories(excel_path, category_column)
        self.location_pool = self._generate_national_locations()
        self.expanded_categories = self._expand_categories_with_gpt()

    def _load_excel_categories(self, path: str, column: str) -> List[str]:
        """Return the distinct non-null values of ``column`` in the workbook at ``path``."""
        df = pd.read_excel(path)
        return df[column].dropna().unique().tolist()

    @classmethod
    def _parse_bullet_lines(cls, text: str) -> List[str]:
        """Split a model reply into list items with their bullet markers removed.

        Blank lines are skipped. When at least one line carries a bullet
        marker, only the bulleted lines are returned so that introductory or
        closing sentences are not mistaken for items; otherwise every
        non-empty line is returned.
        """
        bulleted = []
        every_line = []
        for raw_line in (text or "").splitlines():
            item, marker_count = cls._BULLET_RE.subn("", raw_line.strip())
            item = item.strip()
            if not item:
                continue
            every_line.append(item)
            if marker_count:
                bulleted.append(item)
        return bulleted or every_line

    @classmethod
    def _extract_sub_category(cls, item: str) -> str:
        """Return the sub-category name contained in one list item.

        ``"类型1:施工许可违规"`` yields ``"施工许可违规"`` (the label is a
        placeholder, the name follows the colon); ``"占道经营:说明"`` yields
        ``"占道经营"`` (text after the colon is treated as a description).
        """
        parts = cls._COLON_RE.split(item, maxsplit=1)
        head = parts[0].strip()
        if len(parts) == 2:
            tail = parts[1].strip()
            if tail and cls._LABEL_RE.match(head):
                return tail
        return head

    def _generate_national_locations(self, num=200) -> List[str]:
        """Ask the model for ``num`` locations and return them as a list of strings."""
        response = client.chat.completions.create(
            model=self.MODEL,
            messages=[{
                "role": "user",
                "content": f"生成{num}个中国各城市真实存在的地理位置,按省市区三级格式,示例:\n- 广东省广州市天河区珠江新城\n- 浙江省杭州市余杭区未来科技城"
            }]
        )
        # message.content may be None, hence the fallback to "".
        return self._parse_bullet_lines(response.choices[0].message.content or "")

    def _expand_categories_with_gpt(self) -> Dict[str, List[str]]:
        """Map every base category to the sub-categories proposed by the model."""
        category_map = {}
        for base_cat in self.base_categories:
            response = client.chat.completions.create(
                model=self.MODEL,
                messages=[{
                    "role": "user",
                    "content": f"生成与【{base_cat}】相关但具有政务场景区分度的5个细分类型,示例:\n- 类型1:施工许可违规\n- 类型2:夜间施工超时"
                }]
            )
            items = self._parse_bullet_lines(response.choices[0].message.content or "")
            sub_cats = [name for name in map(self._extract_sub_category, items) if name]
            # Fall back to the base category so later sampling never sees an empty list.
            category_map[base_cat] = sub_cats or [base_cat]
            time.sleep(self.CATEGORY_REQUEST_INTERVAL)
        return category_map

    def generate_dataset(self, num_records: int) -> pd.DataFrame:
        """Generate ``num_records`` validated records and return them as a DataFrame.

        Each iteration samples a base category, one of its sub-categories and a
        location, requests a work order, and keeps it only when it passes
        ``_validate_record``. Accepted contents are stored in
        ``self.used_records`` so an identical text is rejected later.
        The loop runs until enough records have been accepted.
        """
        data = []
        while len(data) < num_records:
            base_cat = random.choice(self.base_categories)
            sub_cat = random.choice(self.expanded_categories[base_cat])
            location = random.choice(self.location_pool)

            content, keywords = self._generate_content(base_cat, sub_cat, location)
            if content and self._validate_record(content, keywords, base_cat):
                # Register the text, otherwise the duplicate check can never fire.
                self.used_records.add(content)
                data.append({
                    "ID": len(data) + 1,
                    "内容": content,
                    "关键词": " ".join(keywords),
                    "参考答案": base_cat,
                    "细分类型": sub_cat,
                    "地理位置": location,
                })
            # Throttle only when another request will follow.
            if len(data) < num_records:
                time.sleep(self.RECORD_REQUEST_INTERVAL)

        return pd.DataFrame(data)

    def _generate_content(self, base_cat: str, sub_cat: str, location: str) -> Tuple[str, List[str]]:
        """Request one work order and return ``(content, keywords)``.

        Returns ``(None, [])`` when the API call fails; the failure is printed
        rather than raised so the caller can simply try again.
        """
        prompt = (
            "生成真实可信的12345政务工单,要求:\n"
            f"1. 主分类:【{base_cat}】\n"
            f"2. 细分类型:【{sub_cat}】\n"
            f"3. 发生地点:【{location}】\n"
            "4. 包含要素:时间、具体问题、影响范围、市民诉求\n"
            f"5. 生成5个关键词(必须包含{base_cat})\n"
            "6. 内容长度80-150字\n"
            "\n"
            "示例格式:\n"
            f"市民反映{location}某建筑工地违规夜间施工至凌晨,噪音严重干扰周边居民。已向环保部门投诉3次未解决,要求立即停工整顿。\n"
            "关键词:夜间施工 噪音污染 环保投诉 施工许可 居民维权"
        )

        try:
            response = client.chat.completions.create(
                model=self.MODEL,
                messages=[
                    {"role": "system", "content": "你是政务数据生成专家"},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.7,
                max_tokens=600,
            )
            raw_text = (response.choices[0].message.content or "").strip()
        except Exception as e:  # API boundary: report and let the caller retry
            print(f"生成失败:{str(e)}")
            return None, []
        return self._parse_generated_text(raw_text)

    def _parse_generated_text(self, text: str) -> Tuple[str, List[str]]:
        """Split a model reply into the work-order text and at most five keywords.

        The keyword line (``关键词`` followed by an ASCII or full-width colon)
        is removed from the content. Keywords may be separated by whitespace,
        commas, semicolons or ``、``. When no keyword line exists the keyword
        list is empty.
        """
        content = re.sub(r"关键词[:：].*", "", text).strip()
        match = self._KEYWORD_LINE_RE.search(text)
        if match is None:
            return content, []
        keywords = [kw for kw in self._KEYWORD_SEP_RE.split(match.group(1)) if kw]
        return content, keywords[:5]

    def _validate_record(self, content: str, keywords: List[str], category: str) -> bool:
        """Return True when a record passes all five acceptance checks.

        Checks: content has at least 80 characters, exactly five keywords,
        the category is one of the keywords, the content has not been used
        before, and the content contains at least one digit.
        """
        return (
            len(content) >= 80 and
            len(keywords) == 5 and
            category in keywords and
            content not in self.used_records and
            any(c.isdigit() for c in content)  # must contain a numeric detail
        )
|
|||
|
|
|||
|
|
|||
|
# 输入文件示例（input.xlsx）
"""
| 基础分类     |
|--------------|
| 施工管理     |
| 消费维权     |
| 城市管理     |
| 公共服务     |
"""
|
|||
|
|
|||
|
|
|||
|
if __name__ == "__main__":
    # Build the generator; construction reads the workbook and queries the API.
    generator = NationalDataGenerator(
        excel_path="input.xlsx", category_column="基础分类"
    )

    # Produce 100 records.
    df = generator.generate_dataset(100)

    # Write the result to an Excel workbook.
    with pd.ExcelWriter("government_12345_data.xlsx") as writer:
        df.to_excel(writer, index=False)

    # Show the first three records as a quick sanity check.
    print("生成数据示例:")
    preview_columns = ["ID", "内容", "关键词", "参考答案"]
    preview = df[preview_columns].head(3)
    print(preview.to_string(index=False))
|