Date
September 10, 2023
Type
Literature Sharing
Level
Advanced
Fields
Data Science, Quantitative Trading, Financial Data, Data Analysis, Machine Learning / Deep Learning
Literature Resources
Introduction, Installation, and Usage
See 【技術嘗試 Try Technique】嘗試一個微調金融領域的 LLM - FinGPT - 來辨識新聞情緒 (trying a fine-tuned financial-domain LLM, FinGPT, to identify news sentiment)
Today's Share
This week we skip the paper discussion and instead walk through FinGPT hands-on.
The basic introduction is available in the public article; this share for subscribers mainly adds a piece of code:
from transformers import AutoModel, AutoTokenizer
from peft import PeftModel
from torch import cuda
from sqlalchemy import create_engine
import pandas as pd
import time

def get_db_engine(user, password, ip, port, database):
    # Build a SQLAlchemy engine for the PostgreSQL connection.
    return create_engine(
        f'postgresql://{user}:{password}@{ip}:{port}/{database}')

def load_news_dataset(start_date, date_col, table_name, db_config):
    # Pull every news row on or after start_date from the source table.
    sql = f"""SELECT * FROM "{table_name}"
    WHERE "{date_col}" >= '{start_date}';"""
    return pd.read_sql(
        sql,
        get_db_engine(**db_config)
    )

def generate_prompts(titles):
    titles = pd.Series(titles)
    instruction_prompt = 'Instruction: What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}'
    input_prompt = 'Input: '
    answer_prompt = 'Answer: '
    titles = titles.apply(lambda x: f'{instruction_prompt}\n{input_prompt}{x}\n{answer_prompt}')
    return titles.tolist()

def load_model(base_model_path: str, peft_model_path: str, cache_dir: str):
    tokenizer = AutoTokenizer.from_pretrained(
        base_model_path,
        trust_remote_code=True,
        cache_dir=cache_dir
    )
    model = AutoModel.from_pretrained(
        base_model_path,
        trust_remote_code=True,
        device_map="auto",
        cache_dir=cache_dir
    )
    # Attach the FinGPT LoRA adapter on top of the base ChatGLM2 model.
    model = PeftModel.from_pretrained(
        model,
        peft_model_path,
        cache_dir=cache_dir
    )
    model = model.eval()
    return tokenizer, model

def predict(prompts: list, model, tokenizer, device):
    print('---[Tokenize]...')
    start_time = time.time()
    tokens = tokenizer(prompts, return_tensors='pt', padding=True)
    print(f'------>[Tokenized] {time.time() - start_time}(s)')
    tokens = tokens.to(device)
    model = model.to(device)
    print('---[Generate Answer]...')
    res = model.generate(**tokens, max_length=512)
    print(f'------>[Generated] {time.time() - start_time}(s)')
    print('---[Decode and Get Key Answer]...')
    result_texts = []
    count = 0
    for i in res:
        print(f'------{count + 1}/{len(res)}')
        sentence = tokenizer.decode(i)
        # The generated text echoes the prompt, so keep only what follows 'Answer: '.
        key_answer = sentence.split('Answer: ')[1].replace(' ', '')
        result_texts.append(key_answer)
        count += 1
    # res_sentences = [tokenizer.decode(i) for i in res]
    # result_texts = [o.split("Answer: ")[1] for o in res_sentences]
    print(f'------>[Get] {time.time() - start_time}(s)')
    return result_texts

def process_batch(df, batch_prompts, model, tokenizer, device, db_table, db_config):
    # Predict sentiment.
    print('---[Predict Batch]...')
    start_time = time.time()
    result_texts = predict(batch_prompts, model, tokenizer, device)
    print(f'---> [Predicted Batch] {time.time() - start_time}(s)')
    # print(f'------> {result_texts}')
    # Save to DB.
    df['sentiment'] = result_texts
    save_to_db(df, db_table, db_config)

def save_to_db(df, db_table, db_config):
    df.to_sql(
        name=db_table,
        con=get_db_engine(**db_config),
        if_exists='append',
        index=False,
        chunksize=1000,
    )

if __name__ == '__main__':
    start_date = '2022-01-01'
    base_model_path = 'THUDM--chatglm2-6b'
    peft_model_path = 'oliverwang15--FinGPT_v31_ChatGLM2_Sentiment_Instruction_LoRA_FT/'
    cache_dir = './'
    load_db_table = ''
    save_db_table = ''
    db_date_col = 'date'
    db_title_col = 'headline'
    db_config = {
        'user': '',
        'password': '',
        'ip': '',
        'port': '',
        'database': ''
    }
    # Get device.
    device = 'cuda' if cuda.is_available() else 'cpu'
    cuda.empty_cache()
    print(f'Device: {device}')
    # Get tokenizer, model.
    print('[Load Model and Tokenizer]...')
    start_time = time.time()
    tokenizer, model = load_model(
        base_model_path,
        peft_model_path,
        cache_dir
    )
    print(f'---> [Loaded] {time.time() - start_time}(s)')
    # Get dataset and generate prompts.
    print('[Load Dataset and Generate Prompts]...')
    start_time = time.time()
    df = load_news_dataset(start_date, db_date_col, load_db_table, db_config)
    prompts = generate_prompts(df[db_title_col])
    print(f'---> [Loaded and Generated] {time.time() - start_time}(s), {len(prompts)} rows.')
    # Divide into batches to predict.
    print('[Predict]...')
    start_time = time.time()
    batch_size = 100
    count = 0
    # Ceiling division to count batches.
    batch_num = (len(prompts) + batch_size - 1) // batch_size
    for i in range(0, len(prompts), batch_size):
        print(f'---(Batch {count + 1}/{batch_num})')
        batch = prompts[i:i + batch_size]
        process_batch(
            df.iloc[i:i + batch_size, :].copy(),
            batch,
            model,
            tokenizer,
            device,
            save_db_table,
            db_config
        )
        cuda.empty_cache()
        count += 1
    print(f'---> [Predicted] {time.time() - start_time}(s)')
If you have a PostgreSQL database full of news headlines, you can use this script to run sentiment analysis in batches; the results are saved to another table (the original columns plus the sentiment column).
Of course, you could also adapt the code to other databases or to CSV files, as in the sketch below.
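For example, here is a minimal sketch of a CSV variant. The file names news.csv / news_with_sentiment.csv and the headline column are assumptions, and it reuses load_model, generate_prompts, and predict from the script above:

# A minimal CSV variant (hypothetical file and column names;
# reuses load_model, generate_prompts, and predict defined above).
import pandas as pd
from torch import cuda

# Hypothetical input file with a 'headline' column.
df = pd.read_csv('news.csv')
device = 'cuda' if cuda.is_available() else 'cpu'
tokenizer, model = load_model(
    'THUDM--chatglm2-6b',
    'oliverwang15--FinGPT_v31_ChatGLM2_Sentiment_Instruction_LoRA_FT/',
    './'
)
prompts = generate_prompts(df['headline'])
sentiments = []
batch_size = 100
for i in range(0, len(prompts), batch_size):
    sentiments += predict(prompts[i:i + batch_size], model, tokenizer, device)
    cuda.empty_cache()
df['sentiment'] = sentiments
df.to_csv('news_with_sentiment.csv', index=False)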
You could also modify it, most simply with input(), to turn it into a chat-style bot; see the sketch below.
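A minimal interactive sketch, assuming tokenizer, model, and device have already been set up with load_model as in the script above:

# A minimal interactive loop (assumes tokenizer, model, and device
# are set up as in the script above; enter an empty line to quit).
while True:
    headline = input('Headline (empty line to quit): ')
    if not headline:
        break
    # Reuse the prompt template and predictor from the script above.
    prompt = generate_prompts([headline])
    print('Sentiment:', predict(prompt, model, tokenizer, device)[0])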
Remember to change variables such as the model paths and the database table names.
PS. Since many readers are not familiar with OOP, I flattened this code into the form shared above. It should be easier to follow, but it is less extensible; if you know how to design for extensibility, feel free to restructure it yourself.
Some of you may also wonder why I have shared financial-domain LLMs two posts in a row. The reason is that my research topic has me digging into this area, and that topic is related to quantitative trading. Yes, that is to say, LLMs really can help improve parts of quantitative trading (there is already literature on this).
They can help in quite a few ways, and we can introduce them gradually in the future.
PS. The paper is not hard to read (in terms of the writing), so take a look if you are interested.