Date
September 10, 2023
Type
Literature Sharing
Level
Advanced
Fields
Data Science, Quantitative Trading, Financial Data, Data Analysis, Machine Learning / Deep Learning
Literature Resources
Introduction, Installation, and Usage
See 【技術嘗試 Try Technique】嘗試一個微調金融領域的 LLM - FinGPT - 來辨識新聞情緒 (trying a fine-tuned financial-domain LLM, FinGPT, to identify news sentiment)
Today's Share
This week we skip the paper discussion and instead walk through FinGPT hands-on.
The basic introduction is available in the public article; this share for subscribers mainly adds a piece of code:
from transformers import AutoModel, AutoTokenizer
from peft import PeftModel
from torch import cuda
from sqlalchemy import create_engine
import pandas as pd
import time

def get_db_engine(user, password, ip, port, database):
    # Build a SQLAlchemy engine for the PostgreSQL connection.
    return create_engine(
        f'postgresql://{user}:{password}@{ip}:{port}/{database}')

def load_news_dataset(start_date, date_col, table_name, db_config):
    # Pull every news row on or after start_date from the source table.
    sql = f"""SELECT * FROM "{table_name}"
    WHERE "{date_col}" >= '{start_date}';"""
    return pd.read_sql(
        sql,
        get_db_engine(**db_config)
    )

def generate_prompts(titles):
    titles = pd.Series(titles)
    instruction_prompt = 'Instruction: What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}'
    input_prompt = 'Input: '
    answer_prompt = 'Answer: '
    titles = titles.apply(lambda x: f'{instruction_prompt}\n{input_prompt}{x}\n{answer_prompt}')
    return titles.tolist()

def load_model(base_model_path: str, peft_model_path: str, cache_dir: str):
    tokenizer = AutoTokenizer.from_pretrained(
        base_model_path,
        trust_remote_code=True,
        cache_dir=cache_dir
    )
    model = AutoModel.from_pretrained(
        base_model_path,
        trust_remote_code=True,
        device_map="auto",
        cache_dir=cache_dir
    )
    # Attach the FinGPT LoRA adapter on top of the base ChatGLM2 model.
    model = PeftModel.from_pretrained(
        model,
        peft_model_path,
        cache_dir=cache_dir
    )
    model = model.eval()
    return tokenizer, model

def predict(prompts: list, model, tokenizer, device):
    print('---[Tokenize]...')
    start_time = time.time()
    tokens = tokenizer(prompts, return_tensors='pt', padding=True)
    print(f'------>[Tokenized] {time.time() - start_time}(s)')
    tokens = tokens.to(device)
    model = model.to(device)
    print('---[Generate Answer]...')
    res = model.generate(**tokens, max_length=512)
    print(f'------>[Generated] {time.time() - start_time}(s)')
    print('---[Decode and Get Key Answer]...')
    result_texts = []
    count = 0
    for i in res:
        print(f'------{count + 1}/{len(res)}')
        sentence = tokenizer.decode(i)
        # The generated text echoes the prompt, so keep only what follows 'Answer: '.
        key_answer = sentence.split('Answer: ')[1].replace(' ', '')
        result_texts.append(key_answer)
        count += 1
    # res_sentences = [tokenizer.decode(i) for i in res]
    # result_texts = [o.split("Answer: ")[1] for o in res_sentences]
    print(f'------>[Get] {time.time() - start_time}(s)')
    return result_texts

def process_batch(df, batch_prompts, model, tokenizer, device, db_table, db_config):
    # Predict sentiment.
    print('---[Predict Batch]...')
    start_time = time.time()
    result_texts = predict(batch_prompts, model, tokenizer, device)
    print(f'---> [Predicted Batch] {time.time() - start_time}(s)')
    # print(f'------> {result_texts}')
    # Save to DB.
    df['sentiment'] = result_texts
    save_to_db(df, db_table, db_config)

def save_to_db(df, db_table, db_config):
    df.to_sql(
        name=db_table,
        con=get_db_engine(**db_config),
        if_exists='append',
        index=False,
        chunksize=1000,
    )

if __name__ == '__main__':
    start_date = '2022-01-01'
    base_model_path = 'THUDM--chatglm2-6b'
    peft_model_path = 'oliverwang15--FinGPT_v31_ChatGLM2_Sentiment_Instruction_LoRA_FT/'
    cache_dir = './'
    load_db_table = ''
    save_db_table = ''
    db_date_col = 'date'
    db_title_col = 'headline'
    db_config = {
        'user': '',
        'password': '',
        'ip': '',
        'port': '',
        'database': ''
    }
    # Get device.
    device = 'cuda' if cuda.is_available() else 'cpu'
    cuda.empty_cache()
    print(f'Device: {device}')
    # Get tokenizer, model.
    print('[Load Model and Tokenizer]...')
    start_time = time.time()
    tokenizer, model = load_model(
        base_model_path,
        peft_model_path,
        cache_dir
    )
    print(f'---> [Loaded] {time.time() - start_time}(s)')
    # Get dataset and generate prompts.
    print('[Load Dataset and Generate Prompts]...')
    start_time = time.time()
    df = load_news_dataset(start_date, db_date_col, load_db_table, db_config)
    prompts = generate_prompts(df[db_title_col])
    print(f'---> [Loaded and Generated] {time.time() - start_time}(s), {len(prompts)} rows.')
    # Divide into batches to predict.
    print('[Predict]...')
    start_time = time.time()
    batch_size = 100
    count = 0
    # Ceiling division to count batches.
    batch_num = (len(prompts) + batch_size - 1) // batch_size
    for i in range(0, len(prompts), batch_size):
        print(f'---(Batch {count + 1}/{batch_num})')
        batch = prompts[i:i + batch_size]
        process_batch(
            df.iloc[i:i + batch_size, :].copy(),
            batch,
            model,
            tokenizer,
            device,
            save_db_table,
            db_config
        )
        cuda.empty_cache()
        count += 1
    print(f'---> [Predicted] {time.time() - start_time}(s)')
If you have a PostgreSQL database full of news headlines, you can use this script to run sentiment analysis in batches; the results are saved to another table (the original columns plus the sentiment column).
Of course, you could also adapt the code to other databases or to CSV files, as in the sketch below.
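For example, here is a minimal sketch of a CSV variant. The file names news.csv / news_with_sentiment.csv and the headline column are assumptions, and it reuses load_model, generate_prompts, and predict from the script above:

# A minimal CSV variant (hypothetical file and column names;
# reuses load_model, generate_prompts, and predict defined above).
import pandas as pd
from torch import cuda

# Hypothetical input file with a 'headline' column.
df = pd.read_csv('news.csv')
device = 'cuda' if cuda.is_available() else 'cpu'
tokenizer, model = load_model(
    'THUDM--chatglm2-6b',
    'oliverwang15--FinGPT_v31_ChatGLM2_Sentiment_Instruction_LoRA_FT/',
    './'
)
prompts = generate_prompts(df['headline'])
sentiments = []
batch_size = 100
for i in range(0, len(prompts), batch_size):
    sentiments += predict(prompts[i:i + batch_size], model, tokenizer, device)
    cuda.empty_cache()
df['sentiment'] = sentiments
df.to_csv('news_with_sentiment.csv', index=False)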
You could also modify it, most simply with input(), to turn it into a chat-style bot; see the sketch below.
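A minimal interactive sketch, assuming tokenizer, model, and device have already been set up with load_model as in the script above:

# A minimal interactive loop (assumes tokenizer, model, and device
# are set up as in the script above; enter an empty line to quit).
while True:
    headline = input('Headline (empty line to quit): ')
    if not headline:
        break
    # Reuse the prompt template and predictor from the script above.
    prompt = generate_prompts([headline])
    print('Sentiment:', predict(prompt, model, tokenizer, device)[0])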
Remember to change variables such as the model paths and the database table names.
PS. Since many readers are not familiar with OOP, I flattened this code into the form shared above. It should be easier to follow, but it is less extensible; if you know how to design for extensibility, feel free to restructure it yourself.
Some of you may also wonder why I have shared financial-domain LLMs two posts in a row. The reason is that my research topic has me digging into this area, and that topic is related to quantitative trading. Yes, that is to say, LLMs really can help improve parts of quantitative trading (there is already literature on this).
They can help in quite a few ways, and we can introduce them gradually in the future.
PS. The paper is not hard to read (in terms of the writing), so take a look if you are interested.