streamlit-testGenius/testGrDownload.py

113 lines
4.7 KiB
Python
Raw Permalink Normal View History

2024-05-28 19:29:43 +08:00
import gradio as gr
import pandas as pd
import tempfile
from http import HTTPStatus
import dashscope
from dashscope import Generation
2024-05-30 17:11:39 +08:00
import os
2024-05-31 19:03:05 +08:00
from testAny import check_df_english, check_df_tags
2024-06-07 18:55:36 +08:00
2024-05-30 17:11:39 +08:00
# Read the DashScope API key from the DASHSCOPE_API_KEY environment variable
# (os.getenv yields None when the variable is unset).
dashscope.api_key = os.getenv("DASHSCOPE_API_KEY")  # Vincent's API key
2024-05-28 19:29:43 +08:00
2024-05-30 17:11:39 +08:00
# todo: delete instruction part or make it optional
2024-06-07 18:55:36 +08:00
# todo: add a checkbox to choose whether to use instruction or not
2024-05-30 17:11:39 +08:00
def response(prompt, instruction=None):
    """Send one prompt to the DashScope ``qwen-plus`` model and return its reply.

    Args:
        prompt: Text sent as the user message.
        instruction: Optional system instruction. ``None`` or a blank /
            whitespace-only value means no system message is sent.

    Returns:
        The content of the first choice when the call succeeds; otherwise a
        string starting with ``"Error:"`` that carries the status code and
        error code of the failed request.
    """
    messages = [{'role': 'user', 'content': prompt}]
    # Treat a blank instruction like a missing one so that an empty textbox
    # does not produce a meaningless system message.
    if instruction is not None and str(instruction).strip():
        messages.insert(0, {'role': 'system', 'content': instruction})

    # Named `result` (not `response`) to avoid shadowing this function.
    result = Generation.call(
        model='qwen-plus',
        messages=messages,
        seed=1234,
        result_format='message',
        stream=False,
        incremental_output=False,
        temperature=1.8,
        top_p=0.9,
        top_k=999,
    )
    if result.status_code == HTTPStatus.OK:
        return result.output.choices[0]['message']['content']

    # Log the failure details, then hand a readable error string to the caller
    # so a single failed row does not abort a whole batch.
    print('Request id: %s, Status code: %s, error code: %s, error message: %s' % (
        result.request_id, result.status_code,
        result.code, result.message
    ))
    return f"Error: Could not generate response with Status code: {result.status_code}, error code: {result.code}"
2024-05-30 17:11:39 +08:00
def format_full_prompt(df, introduction):
    """Fill the ``full_prompt`` template of every row with that row's values.

    Each row's template is formatted with ``str.format`` using all other
    columns of the row as named placeholders, plus a temporary ``context``
    placeholder built as ``"<RAG1>-<RAG2>"``.

    Args:
        df: DataFrame with ``RAG1`` and ``RAG2`` columns and, unless a long
            ``introduction`` is given, a ``full_prompt`` template column.
            The frame is modified in place.
        introduction: Optional template string. When it is at least 200
            characters long it replaces the ``full_prompt`` template of
            every row; ``None`` or a shorter string is ignored.

    Returns:
        The same DataFrame, with ``full_prompt`` holding the formatted
        prompts and the temporary ``context`` column removed.

    Raises:
        KeyError: If there is no ``full_prompt`` column and ``introduction``
            does not qualify as a template, or if a template references a
            placeholder that is not a column.
    """
    # `introduction` may be None (process_xlsx's default), so guard the len().
    use_introduction = introduction is not None and len(introduction) >= 200
    if not use_introduction and 'full_prompt' not in df.columns:
        raise KeyError(
            "'full_prompt' column is missing and no introduction of at least "
            "200 characters was supplied to use as the template"
        )

    # Build the per-row context placeholder by joining RAG1 and RAG2.
    df['context'] = df.apply(lambda row: f"{row['RAG1']}-{row['RAG2']}", axis=1)

    if use_introduction:
        df['full_prompt'] = introduction

    # Every column except the template itself is available as a placeholder.
    placeholder_columns = [col for col in df.columns if col != 'full_prompt']
    df['full_prompt'] = df.apply(
        lambda row: row['full_prompt'].format(
            **{col: row[col] for col in placeholder_columns}
        ),
        axis=1,
    )

    # The context column was only needed for formatting.
    df.drop(columns=['context'], inplace=True)
    return df
2024-06-04 00:04:12 +08:00
def process_xlsx(xlsx_file, instruction=None, loops=1):
    """Run every prompt of an xlsx sheet through the model and save the result.

    Args:
        xlsx_file: Path or file-like object accepted by ``pandas.read_excel``.
        instruction: Optional instruction text; passed to
            ``format_full_prompt`` and to ``response`` for every prompt.
        loops: How many times each prompt is sent. Values below 1 leave the
            sheet un-repeated (a single pass).

    Returns:
        A ``(DataFrame, path)`` tuple: the processed frame, including the
        ``Response`` column and whatever ``check_df_tags`` /
        ``check_df_english`` return, and the path of the ``.xlsx`` file it
        was written to.
    """
    # Load the uploaded workbook and expand the prompt templates.
    df = pd.read_excel(xlsx_file)
    formatted_df = format_full_prompt(df, instruction)

    # UI sliders may hand over a float; range() needs an int.
    loops = int(loops)
    if loops >= 1:
        # Repeat the whole sheet `loops` times with a single concat.
        copies = [formatted_df.copy() for _ in range(loops)]
        formatted_df = pd.concat(copies, ignore_index=True)

    # Query the model once per (repeated) prompt row.
    formatted_df['Response'] = formatted_df['full_prompt'].apply(
        lambda prompt: response(prompt, instruction)
    )

    # Check the frame for tags and English.
    formatted_df = check_df_tags(formatted_df)
    formatted_df = check_df_english(formatted_df)

    # Reserve a temp file that survives closing (delete=False). Taking `.name`
    # from a delete=True file lets the file vanish and leaves only a reusable,
    # unreserved path. The file is not removed here because the caller needs
    # it for download.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp_file:
        tmp_path = tmp_file.name
    formatted_df.to_excel(tmp_path, index=False, engine='openpyxl')
    return formatted_df, tmp_path
def main():
    """Build the Gradio UI for batch-processing an xlsx of prompts and launch it.

    The page offers a system-instruction textbox, a repeat-count slider, a
    file upload, a result table, a download slot and two clear buttons. The
    submit button passes the upload, instruction and repeat count to
    ``process_xlsx``.
    """
    with gr.Blocks() as demo:
        gr.Markdown("### 大模型xlsx处理工具")
        with gr.Accordion("输入说明"):
            gr.Markdown("请上传一个xlsx文件文件应包含prompts。")

        system_instruction = gr.Textbox(label="System Instruction", lines=2,
                                        value=" ")
        slider = gr.Slider(minimum=1, maximum=10, step=1, label="循环次数", value=1)
        file_input = gr.File(label="上传xlsx文件")
        submit_button = gr.Button("处理xlsx")
        output_table = gr.Dataframe(label="处理后的数据")
        output_file = gr.File(label="下载处理后的文件")

        # The clear buttons are wired through `components`, so no reference
        # to them needs to be kept.
        gr.ClearButton(components=[output_table, output_file], value="Clear processed data")
        gr.ClearButton(components=[file_input, output_table, output_file], value="Clear console")

        def update_output(xlsx_file, instruction, loops):
            """Process the uploaded file; return (table, file path) for the two outputs."""
            if xlsx_file is None:
                # Two outputs are wired, so always return two values; None
                # for each leaves both outputs empty when nothing was uploaded.
                return None, None
            formatted_df, tmp_path = process_xlsx(xlsx_file, instruction, loops=loops)
            return formatted_df, tmp_path

        submit_button.click(fn=update_output, inputs=[file_input, system_instruction, slider],
                            outputs=[output_table, output_file])

    demo.launch()
# Start the UI only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    main()