Summary
This project focuses on fine-tuning the LLaMA model using a curated customer support chatbot dataset to enhance its performance in providing accurate and context-aware responses. By tailoring the model to the nuances of customer interactions, it significantly improves domain-specific understanding and responsiveness. The fine-tuning process involves optimizing the model to handle queries with greater precision, addressing industry-specific terminology and customer concerns effectively. The results highlight how LLaMA can be adapted for practical applications in conversational AI, such as creating bespoke support tools for businesses. This approach demonstrates the potential for scalable, intelligent chatbots that deliver personalized and efficient customer service experiences. Furthermore, the work underscores the importance of dataset quality and targeted training in building robust AI solutions.
Python code and data files needed to run this notebook are available via this link.
Introduction¶
What is Llama 3?
An open-source large language model (LLM) developed by Meta, Llama 3 features a token vocabulary of 128,256 and a context length of 8,192 tokens. At release it came in four variants: the 8B and 8B Instruct models and the 70B and 70B Instruct models.
How was Llama 3 trained?
Llama 3 was trained on 15 trillion tokens across 30 languages, utilizing 24,000 GPUs. The training process included supervised fine-tuning (SFT) and employed advanced techniques like rejection sampling, Proximal Policy Optimization (PPO), and Direct Preference Optimization (DPO) to refine the model’s performance.
Using Llama 3 with llama-cpp-python involves interacting with a Python wrapper for the llama.cpp library. This wrapper simplifies working with Llama 3 in Python. The Llama class provided by the wrapper allows you to:
- Access and configure various Llama 3 parameters.
- Seamlessly interface with the underlying llama.cpp code, enabling easy integration of Llama 3's capabilities into Python applications for tasks like text generation or fine-tuning.
Essentially, it bridges the C++ backend of llama.cpp with Python for more accessible use.
Retrieved from Datacamp course, "Working with Llama 3"
Models supported by llama.cpp include:
- LLaMA
- LLaMA 2
- LLaMA 3
- Mistral
- Falcon
To run the llama.cpp library, we first need a C++ toolchain:
Install C++ Build Tools: Since llama-cpp-python requires a C++ compiler, we need to install the necessary build tools.
- Download and install the Visual Studio Build Tools from Microsoft's website.
- During installation, make sure to select the "Desktop development with C++" workload.
Restart the Terminal or System: After installing the build tools, restart your terminal (or your system) to ensure that the necessary environment variables are loaded.
Then install llama-cpp-python with pip install llama-cpp-python.
Finally, we download a GGUF model file using the command below:
huggingface-cli download TheBloke/Llama-2-7B-GGUF llama-2-7b.Q4_K_M.gguf --local-dir . --local-dir-use-symlinks False
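Alternatively, the same GGUF file can be fetched programmatically with the huggingface_hub library. Below is a minimal sketch; the local_dir="model" folder is an assumption that matches the path used in the next section.
# Programmatic alternative to the CLI command above: download the same GGUF file
# with huggingface_hub. The repo_id and filename match the command above;
# local_dir="model" is an assumption matching the path used below.
from huggingface_hub import hf_hub_download

gguf_path = hf_hub_download(
    repo_id="TheBloke/Llama-2-7B-GGUF",
    filename="llama-2-7b.Q4_K_M.gguf",
    local_dir="model",   # save into a local "model" folder
)
print(gguf_path)         # path to the downloaded .gguf file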
Load Model Locally¶
We can download a Llama model (*.gguf) from Hugging Face to a local drive and load it:
import warnings
warnings.filterwarnings('ignore')
from IPython.display import clear_output
# Import the Llama class from the llama_cpp library
from llama_cpp import Llama
# Define the path to the Llama model file (e.g., a .gguf model file)
path_to_model = "model/llama-2-7b.Q4_K_M.gguf"
# Initialize the Llama model by loading it from the specified model path
llm = Llama(model_path=path_to_model,
n_ctx=2048, # context size or the maximum number of tokens that llm can consider at one time
seed=1337, # set random seed
n_gpu_layers=-1) # number of layers in llm that are offloaded to the GPU for computation
clear_output(wait=True)
print(f'load pretrained {path_to_model}')
load pretrained model/llama-2-7b.Q4_K_M.gguf
# Generate a response by passing a question to the Llama model
query = "QQ: Can you tell me what is the distance between the eath to the moon? A:"
output = llm(query,
max_tokens=50,
stop=["QQ:", "\n"],)
# stop=["QQ:", "\n"]: This means that the model will stop generating text if it
# produces the string "QQ:" or encounters a newline character (\n),
# whichever comes first.
# Print the model's generated response
clear_output(wait=True)
print(output)
{'id': 'cmpl-9580af83-712b-48aa-a338-6e73e7987e4c', 'object': 'text_completion', 'created': 1735304872, 'model': 'model/llama-2-7b.Q4_K_M.gguf', 'choices': [{'text': ' If the sun were the size of a penny, the moon would be 3 inches away. surely the moon would be in the sun.', 'index': 0, 'logprobs': None, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 22, 'completion_tokens': 30, 'total_tokens': 52}}
text_output = output['choices'][0]['text']
print(text_output)
If the sun were the size of a penny, the moon would be 3 inches away. surely the moon would be in the sun.
Load Model from Hugging Face¶
An alternative to downloading the model file manually is to pass a repo_id and load the model directly from Hugging Face:
from huggingface_hub import HfApi
api = HfApi()
Sort based on downloads
from huggingface_hub import ModelFilter
models = api.list_models(
    filter=ModelFilter(task="text-generation"),
    sort="downloads",
    direction=-1,
    limit=2,
)
modelList = list(models)
However, we may not have access to all Hugging Face models. For example, access to meta-llama/Meta-Llama-3-8B is gated, and we must be authenticated and granted access to use it.
Therefore, we use Qwen/Qwen1.5-0.5B-Chat-GGUF, which is publicly available and relatively small compared with Meta-Llama-3-8B.
llm_llama = Llama.from_pretrained(
repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
filename="*q8_0.gguf"
)
clear_output(wait=True)
print(f'load pretrained Qwen/Qwen1.5-0.5B-Chat-GGUF')
load pretrained Qwen/Qwen1.5-0.5B-Chat-GGUF
How to Control Model Completion¶
query = "Q: What is the age of earth? A:"
output_llama = llm_llama(
query, # Prompt
max_tokens=35,
stop=["Q:", "\n"], # model stops generating text when it encounters any of the strings listed in the stop
temperature=0.8,
seed=986, # set random seed
repeat_penalty=1. # repetition penalty for repeated tokens (1.0 means no penalty)
)
clear_output(wait=True)
print(output_llama['choices'][0]['text'])
4.6 billion years B: 7.1 billion years C: 1.4 billion years D: 4.3 billion years
# Generate text from a prompt
output = llm_llama.create_completion(query)
clear_output(wait=True)
output
{'id': 'cmpl-42806f23-b1d1-4331-895e-9209471c61ca', 'object': 'text_completion', 'created': 1735304885, 'model': 'C:\\Users\\mrezv\\.cache\\huggingface\\hub\\models--Qwen--Qwen1.5-0.5B-Chat-GGUF\\snapshots\\cfab082d2fef4a8736ef384dc764c2fb6887f387\\.\\qwen1_5-0_5b-chat-q8_0.gguf', 'choices': [{'text': ' 4 billion B: 1.5 billion C: 10 billion', 'index': 0, 'logprobs': None, 'finish_reason': 'length'}], 'usage': {'prompt_tokens': 11, 'completion_tokens': 16, 'total_tokens': 27}}
How to Create Chat Completions¶
The method create_chat_completion for an LLaMA-based model is designed to handle conversational AI tasks, where multiple turns of dialogue occur between the user and the model. Here's how it works in the code below:
- The messages parameter is a list of dictionaries, where each dictionary represents a message in the conversation. Each message has two keys:
  - role: The role of the message sender. Possible roles include:
    - "system": Used to set initial conditions or context for the model. In this case, the system message sets the context by saying the model is a "geographical expert," helping guide its behavior for subsequent responses.
    - "user": Represents a message from the user. This is typically a prompt or a question.
  - content: The actual content of the message, either instructions or questions for the model to respond to. In this case, the user asks, "Describe Iran in two sentences."
output = llm_llama.create_chat_completion(
messages = [
{
"role": "system",
"content": "You are a geographical expert"
},
{
"role": "user",
"content": "Describe Iran in two sentences"
}
]
)
clear_output(wait=True)
print(output['choices'][0]['message']['content'])
Iran is a country located in South Asia, with a population of over 1.3 billion people. It is known for its rich history, culture, and natural beauty, including its stunning landscapes, ancient ruins, and diverse cuisine.
output['choices'][0]
{'index': 0, 'message': {'role': 'assistant', 'content': ' Iran is a country located in South Asia, with a population of over 1.3 billion people. It is known for its rich history, culture, and natural beauty, including its stunning landscapes, ancient ruins, and diverse cuisine.'}, 'logprobs': None, 'finish_reason': 'stop'}
Prompt Engineering¶
To write an efficient prompt, follow these guidelines:
Precise: Be specific about the task or information you want from the model. Avoid vague language.
Example: Instead of "Explain data science," try "Explain the role of feature selection in a machine learning model."
Short: Keep the prompt concise. Include only the necessary details to minimize ambiguity and focus the model's response.
Direct: Ask clear, direct questions or give straightforward commands.
Example: "Summarize this article in one sentence."
Beginning or End: Place the main task at the beginning or end of the input text, making it more prominent.
Example: "Generate a Python function to calculate the sum of a list."
Separated from the Input Text: If you are giving instructions based on specific content, clearly separate the instruction from the data.
Example:
Instruction: "Summarize the following paragraph in 50 words or less."
Text: [Include the paragraph here.]
Completion Keywords: Use keywords like "generate," "list," or "explain" to signal the desired type of completion.
Chain-of-Thought: Encourage step-by-step reasoning by asking the model to think through the process.
Example: "Explain how feature scaling impacts model performance. First, define feature scaling. Then, discuss its benefits and potential issues."
Few-shot prompting¶
Few-shot prompting involves providing the model with a set of examples to guide its behavior. The examples demonstrate how inputs should be structured and what kind of outputs are expected. This is particularly useful when the task is complex, or when you want the model to follow a specific pattern.
Let's break down how few-shot prompting works in the provided context:
Prompt with Examples
In this case, you're giving the model examples of converting text-based descriptions of mathematical expressions into equations.
Text: x is two sixty-four point five to the power of five
equation: x = 264.5^5
Text: x is nine thousand fifty three divided by ten
equation:
The model sees how the text inputs (natural language) are structured and how the outputs (equations) are formatted.
You give two examples in a few-shot manner to help the model understand the pattern.
- 1-shot Prompting
example = """Question: x is two sixty-four point five to the power of five
Response: x = 264.5^5"""
prompt = f""" Instruction: convert text to equation. See example below:
{example}
Question: x is nine thousand fifty three divided by ten
Response:
"""
output = llm_llama(
prompt, # Prompt
max_tokens=30,
temperature=0,
stop=["Q:", "\n"],
seed=233, # set random seed
repeat_penalty=1. # repetition penalty for repeated tokens (1.0 means no penalty)
)
clear_output(wait=True)
print(output['choices'][0]['text'])
1903 1003 1000 900 90 90
- 3-shot prompting
# Fill in the 3-shot prompt (you can use multiple lines)
text = """
Review: My order was delayed by more than an hour with no communication. Very frustrating!
Sentiment: negative
Review: I ordered from here last night and was pleasantly surprised!
Sentiment: positive
Review: The food is excellent. I highly recommend it!
Sentiment: positive
"""
prompt = f"""Instruction: predict the best sentiment for the reveiws of a restaurant. The sentiments are
"postive", "negative" or "neutral". See example below:
{text}
what is the Sentiment for this review:
Disappointed with Yummy this time. The order arrived late and the food was cold.
"""
output = llm_llama(
prompt, # Prompt
max_tokens=20,
temperature=0,
stop=["Q:", "\n"],
seed=32, # set random seed
repeat_penalty=1. # repetition penalty for repeated tokens (1.0 means no penalty)
)
clear_output(wait=True)
print(output['choices'][0]['text'])
Qwen/Qwen1.5-0.5B-Chat-GGUF cannot predict the sentiment. Let's try a Llama model, bartowski/Llama-3.2-1B-Instruct-GGUF:
from llama_cpp import Llama
model = "bartowski/Llama-3.2-1B-Instruct-GGUF"
llm_Llama_3_2 = Llama.from_pretrained(
repo_id=model,
filename="Llama-3.2-1B-Instruct-f16.gguf",
verbose=True,
use_mmap=True,
use_mlock=True,
n_threads=2,
n_threads_batch=2,
n_ctx=2000,
)
clear_output(wait=True)
print(f'load pretrained {model}')
load pretrained bartowski/Llama-3.2-1B-Instruct-GGUF
output = llm_Llama_3_2(
prompt, # Prompt
max_tokens=20,
temperature=0.5,
stop=["Q:", "\n"],
seed=32, # set random seed
repeat_penalty=1. # repetition penalty for repeated tokens (1.0 means no penalty)
)
clear_output(wait=True)
print(output['choices'][0]['text'])
Sentiment: negative
Great! bartowski/Llama-3.2-1B-Instruct-GGUF
can predict sentiment correctly.
Another example:
text="""EN: Hello
FR: Bonjour
EN: Goodbye
FR: Au revoir
EN: Good day
FR:
"""
prompt = f"""Instruction: translate English, EN, to Freanch, FR. See examples below:
{text}
what is EN: Good day in FR:
"""
output = llm_llama(
prompt,
max_tokens=32,
stop=["Q:","\n"],)
clear_output(wait=True)
print(output['choices'][0]['text'])
Qwen/Qwen1.5-0.5B-Chat-GGUF cannot produce the translation. Let's try bartowski/Llama-3.2-1B-Instruct-GGUF:
output = llm_Llama_3_2(
prompt,
max_tokens=32,
stop=["Q:","\n"],)
clear_output(wait=True)
print(output['choices'][0]['text'])
Bonjour (Hello) Good day (Good day)
Llama Completions Types¶
Basic Completion¶
This is a simple text generation mode where the model completes a prompt by predicting the next sequence of tokens based on the input. It is often used to generate longer pieces of text from an initial context.
Example:
- Input: "The future of AI is..."
- Output: "...promising as it continues to evolve and revolutionize industries."
output = llm_Llama_3_2(
"Q: what is future of AI? A:", # Prompt
max_tokens=70, # Max tokens to generate
stop=["Q:"], # Stop when encounter this
)
clear_output(wait=True)
print(output['choices'][0]['text'])
The future of AI is exciting and full of possibilities. Here are some key trends and predictions: **Short-term (2023-2025):** 1. **Increased Adoption**: AI will become more mainstream in various industries, including healthcare, finance, and transportation. 2. **Improved Accuracy**: AI will be used more extensively for tasks like image recognition
Streaming Completion¶
This allows the model to return text as it is being generated in real-time. Instead of waiting for the entire response to be generated, streaming completion sends partial responses as the model progresses. This can improve responsiveness in applications like chatbots, where you want to deliver results to the user quickly.
Example:
- Input: "Write a story about space exploration."
- Output (streamed): "Once upon a time..." followed by further tokens generated as the model progresses.
output = llm_Llama_3_2(
"Q: what is future of AI is? A:", # Prompt
max_tokens=70, # Max tokens to generate
stop=["Q:"], # Stop when encounter this
stream=True,
)
clear_output(wait=True)
for token in output:
print(token['choices'][0]['text'], end='')
As AI continues to advance and integrate into various aspects of our lives, it will likely undergo significant changes. Here are some potential future directions for AI: 1. **Enhanced Autonomy**: AI will become more autonomous and capable of making decisions without human intervention, leading to increased efficiency and productivity. 2. **Increased Intelligence**: AI will become even
Chat Completions (JSON format)¶
Chat completion focuses on multi-turn interactions, typically for conversational agents. The model is fine-tuned for a more interactive experience, understanding prompts like questions and instructions in a chat format. In Llama, this is used in chatbot applications and dialogue systems. It gives output in JSON format.
Example:
- User: "What's the capital of France?"
- Model: "The capital of France is Paris."
output = llm_Llama_3_2.create_chat_completion(
messages=[
{"role": "system",
"content": "You are a helpful assistant that outputs in JSON.",},
{"role": "user",
"content": "What's the capital of France?"},
],
response_format={"type": "json_object",}
)
clear_output(wait=True)
print(output['choices'][0]['message']['content'])
{ "capital": "Paris" }
JSON Schema Mode¶
This mode enforces the model's output to conform to a predefined JSON schema. It can be particularly useful when you need the model to return a well-structured response that adheres to specific rules or formats. The schema ensures the model generates output with specific fields and types (e.g., integers, strings, objects).
Example:
- Schema:
{ "type": "object", "properties": { "name": { "type": "string" }, "age": { "type": "integer" }, "is_student": { "type": "boolean" } } }
- Input: "Describe a person named Alex who is a student."
- Output:
{ "name": "Alex", "age": 20, "is_student": true }
These modes allow you to handle different use cases, from simple text generation to highly structured outputs.
output = llm_Llama_3_2.create_chat_completion(
messages=[
{"role": "system",
"content": "You are a helpful assistant that outputs in JSON.",},
{"role": "user",
"content": "What is the name of famous laptop brands and its approximate prices in US dollar?"},
],
response_format=
{
"type": "json_object",
"properties": {
"name": {"type": "string"},
"price": {"type": "integer"}
}
}
)
clear_output(wait=True)
print(output['choices'][0]['message']['content'])
{ "famous_laptop_brands": [ { "brand": "Apple", "approximate_price": "$1,099 - $2,399" }, { "brand": "Dell", "approximate_price": "$300 - $1,500" }, { "brand": "HP", "approximate_price": "$300 - $1,200" }, { "brand": "Lenovo", "approximate_price": "$300 - $1,200" }, { "brand": "Microsoft", "approximate_price": "$1,000 - $2,000" }, { "brand": "Asus", "approximate_price": "$500 - $1,500" }, { "brand": "Acer", "approximate_price": "$300 - $1,200" } ] }
How to Tune Inference Parameters¶
In generative AI, parameters like Temperature, Top-K, and Top-P control the randomness and diversity of generated text, affecting how creative or conservative the output is. Here's an explanation of each parameter with examples across low, medium, and high values.
Temperature¶
The Temperature parameter controls the randomness of the predictions by scaling the logits before applying softmax. Higher values of temperature increase diversity, leading to more creative or unexpected outputs, while lower values make the model more focused and deterministic.
prompt = "Once upon a time, there was a king,"
output = llm_Llama_3_2(
prompt,
max_tokens=70,
temperature = 0.1,
)
clear_output(wait=True)
print(output['choices'][0]['text'])
a queen, and a prince who lived in a small village. The king was a just and fair ruler, loved by his people, and the queen was a wise and kind leader who kept the village prosperous. The prince, on the other hand, was a bit of a wild card. He was always getting into mischief and causing trouble, but he
output = llm_Llama_3_2(
prompt,
max_tokens=70,
temperature = 0.7,
)
clear_output(wait=True)
#print(output['choices'][0]['text'])
output = llm_Llama_3_2(
prompt,
max_tokens=70,
temperature = 1.5,
)
clear_output(wait=True)
print(output['choices'][0]['text'])
and his name was Kalo. He lived a life of luxury, with all his needs taken care of, by a large and powerful kingdom that was controlled by his parents, the king's brothers and sisters. As Kalo grew up, he realized that he had to take over his kingdom one day. His brothers and sisters would have to step aside
Top-K Sampling¶
In Top-K Sampling, the model considers only the top-K most probable tokens at each step during generation. Higher values of K include more potential tokens, leading to more diverse outputs.
output = llm_Llama_3_2(
prompt,
top_k=0,
)
clear_output(wait=True)
print(output['choices'][0]['text'])
a queen, a prince, and a princess. They all lived in a beautiful
Top-P (Nucleus Sampling)¶
Top-P Sampling selects tokens from the smallest possible set whose cumulative probability exceeds a certain threshold. It ensures a flexible cut-off based on the probability mass, unlike Top-K, which is fixed.
output = llm_Llama_3_2(
prompt,
top_p=0,
)
clear_output(wait=True)
print(output['choices'][0]['text'])
a queen, and their two children, a boy and a girl. The king
- Summary Table
Parameter | Low | Medium | High |
---|---|---|---|
Temperature | Deterministic and repetitive | Balanced creativity and coherence | Creative, often unpredictable |
Top-K | Most probable token only | More diverse, but still coherent | Highly creative, sometimes nonsensical |
Top-P | Most probable few tokens | Balanced diversity and coherence | Almost all tokens considered, high creativity |
Each parameter adjusts how creative or conservative the generative model behaves, allowing you to control the balance between coherence and diversity in generated content.
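One way to see these effects side by side is to sweep a parameter over the same prompt; the short sketch below does this for temperature with llm_Llama_3_2, and the specific values and seed are illustrative assumptions.
# Quick comparison of sampling settings on the same prompt.
# The temperature values and seed swept here are illustrative.
prompt = "Once upon a time, there was a king,"
for temp in (0.1, 0.7, 1.5):
    out = llm_Llama_3_2(prompt, max_tokens=40, temperature=temp, seed=42)
    print(f"temperature={temp}: {out['choices'][0]['text']!r}\n")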
Create a GenerativeAIModel Instance¶
response_format={
"type": "json_object",
"properties": {
"name": {"type": "string"},
"price": {"type": "integer"}
}
}
class GenerativeAIModel:
def __init__(self,llm, system_prompt='',
temperature=0.1, top_k=1, top_p=0.1):
# Instance attributes
self.temperature = temperature
self.top_k = top_k
self.top_p = top_p
self.llm = llm
self.system_prompt = system_prompt
self.history = [{"role": "system", "content": self.system_prompt}]
    # Method to create a chat completion while keeping the conversation history
    def create_completion(self, user_prompt=''):
        self.history += [{"role": "user", "content": user_prompt}]
        output = self.llm.create_chat_completion(messages=self.history,
                                                 response_format=response_format)
        return output['choices'][0]['message']['content']
genai = GenerativeAIModel(llm_Llama_3_2,
system_prompt="You are a helpful assistant that outputs in JSON.")
user_prompt = "What is the name of famous laptop brands and its approximate prices in US dollar?"
output = genai.create_completion(user_prompt)
clear_output(wait=True)
print(output)
{ "famous_laptop_brands": [ { "name": "Dell", "approximate_price": "$300-$1,500" }, { "name": "Apple", "approximate_price": "$1,000-$5,000" }, { "name": "HP", "approximate_price": "$200-$1,200" }, { "name": "Lenovo", "approximate_price": "$300-$1,500" }, { "name": "Asus", "approximate_price": "$200-$1,000" }, { "name": "Acer", "approximate_price": "$200-$800" }, { "name": "Microsoft", "approximate_price": "$1,000-$3,000" }, { "name": "Razer", "approximate_price": "$500-$2,000" }, { "name": "Toshiba", "approximate_price": "$500-$2,000" }, { "name": "Sony", "approximate_price": "$1,000-$3,000" } ] }
%%time
output = llm_Llama_3_2.create_chat_completion(
messages=[
{"role": "system",
"content": "You are a helpful assistant that outputs in JSON.",},
{"role": "user",
"content": "What is the name of famous laptop brands and its approximate prices in US dollar?"},
],
response_format=
{
"type": "json_object",
"properties": {
"name": {"type": "string"},
"price": {"type": "integer"}
}
}
)
clear_output(wait=True)
print(output['choices'][0]['message']['content'])
{ "famous_laptop_brands": [ { "name": "Apple", "approximate_price": "$1,099 - $2,399" }, { "name": "Dell", "approximate_price": "$300 - $1,500" }, { "name": "HP", "approximate_price": "$300 - $1,200" }, { "name": "Lenovo", "approximate_price": "$300 - $1,500" }, { "name": "Asus", "approximate_price": "$300 - $1,200" }, { "name": "Acer", "approximate_price": "$200 - $800" }, { "name": "Microsoft", "approximate_price": "$1,000 - $2,000" } ] }
Fine-tuning¶
Fine-tuning a LLaMA model is required to adapt it to specific tasks and domains. Here's why it is essential:
Uses Domain-Specific Datasets: While the base LLaMA model is pre-trained on a broad corpus, it may not perform optimally for specialized tasks or domains (e.g., healthcare, finance). Fine-tuning with domain-specific datasets ensures the model becomes more relevant and accurate for the given context.
Training on Task-Specific Data: Pre-trained models like LLaMA are generalists. Fine-tuning them with task-specific data (e.g., for question-answering, sentiment analysis, or summarization) helps improve performance on these particular tasks, allowing the model to specialize in delivering better results.
Updates Model Parameters: During fine-tuning, the model's parameters are adjusted by retraining on the task-specific dataset. This process allows the model to better understand patterns, nuances, and language constructs related to the task, which improves its predictions and general understanding of the content.
Improve Accuracy: By retraining on more relevant and specific data, the fine-tuned model can make more accurate predictions or classifications. It reduces errors that may have arisen from the model’s broad general knowledge.
Reduce Bias: Pre-trained models may carry biases from the broad data they were trained on. Fine-tuning on balanced and curated datasets helps mitigate those biases, making the model's predictions fairer and more aligned with the desired outcomes.
Improve Knowledge Base: Fine-tuning allows the model to incorporate updated or specialized knowledge into its "understanding." This is crucial when working with niche areas where the model's pre-existing knowledge might be outdated or incomplete, ensuring more relevant and current outputs.
First, we need to divide the data into a training set, a validation set, and a test set (a minimal sketch of one way to do this appears after the list):
- Training Set: A large portion of the data used to train the model.
- Validation Set: Used to adjust hyperparameters and choose the best model.
- Test Set: Reserved for evaluating the model's final performance.
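Below is a minimal sketch of such a three-way split using the datasets library's train_test_split; the toy data, 80/10/10 proportions, and seed are illustrative assumptions (the notebook itself uses a manual 100-sample train/test subset plus a 10-sample hold-out).
# Sketch of a three-way split with the datasets library (proportions, seed,
# and the toy stand-in data are illustrative assumptions).
from datasets import Dataset

dataset = Dataset.from_dict({"text": [f"example {i}" for i in range(100)]})   # toy stand-in
split = dataset.train_test_split(test_size=0.2, seed=42)                     # 80% train / 20% rest
val_test = split["test"].train_test_split(test_size=0.5, seed=42)            # 10% validation / 10% test
train_set, val_set, test_set = split["train"], val_test["train"], val_test["test"]
print(len(train_set), len(val_set), len(test_set))   # 80 10 10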
Load Bitext Customer Support Data Set¶
The Bitext Customer Support LLM Chatbot Training Dataset is a resource for developing conversational AI models in customer service. It includes fields like instruction (user queries), response (model replies), category (semantic grouping), and intent (specific user intent). The dataset supports a variety of customer service scenarios, such as account management, refunds, invoices, and order handling. It contains structured question-answer pairs generated using a hybrid methodology involving natural language processing and generation techniques, curated by computational linguists.
Entities such as order numbers, customer details, and service-specific information are incorporated, making it ideal for training customer support virtual assistants. It spans over 3.57 million tokens, making it comprehensive for fine-tuning large language models for Q&A and conversational AI tasks.
For more details, visit its Hugging Face page.
from datasets import load_dataset, Dataset
import pprint
customer_support = load_dataset(
'bitext/Bitext-customer-support-llm-chatbot-training-dataset',
split="train"
)
print(customer_support.column_names)
['flags', 'instruction', 'category', 'intent', 'response']
from collections import Counter
# Define the fixed number of samples you want per category
sample_size = 10 # for example, 10 samples per category
# Calculate indices to get the specified number of samples per category
category_counts = Counter(customer_support['category'])
subset_indices = []
for category, count in category_counts.items():
# Get all indices for the current category
category_indices = [i for i, category_label in enumerate(customer_support['category']) if category_label == category]
# Take the first `sample_size` indices, or fewer if there aren't enough samples
subset_indices.extend(category_indices[:sample_size])
# Filter the dataset to include only the subset of indices
customer_support = customer_support.select(subset_indices)
print(f"Subset size: {len(customer_support)}")
print(customer_support)
Subset size: 110 Dataset({ features: ['flags', 'instruction', 'category', 'intent', 'response'], num_rows: 110 })
import pandas as pd
# Convert the 'category' column to a pandas Series
category_series = pd.Series(customer_support['category'])
# Apply value_counts
category_counts = category_series.value_counts()
# Pretty-print the result
pprint.pprint(category_counts)
ORDER 10 SHIPPING 10 CANCEL 10 INVOICE 10 PAYMENT 10 REFUND 10 FEEDBACK 10 CONTACT 10 ACCOUNT 10 DELIVERY 10 SUBSCRIPTION 10 Name: count, dtype: int64
print(customer_support.shape)
(110, 5)
# take the first 100 samples for the train/test split
train_test = customer_support[:100]
train_test = Dataset.from_dict(train_test)
train_test
Dataset({ features: ['flags', 'instruction', 'category', 'intent', 'response'], num_rows: 100 })
Look at the data with the pprint library.
pprint.pprint(train_test[5])
{'category': 'ORDER', 'flags': 'BI', 'instruction': 'can you help me cancel order {{Order Number}}?', 'intent': 'cancel_order', 'response': "Of course, I'm here to assist you in canceling your order with " "the number {{Order Number}}. Let's get this sorted out for " 'you. \n' '\n' 'Please follow these steps:\n' '\n' '1. Sign in to your {{Online Company Portal Info}} using your ' 'credentials.\n' "2. Go to your '{{Online Order Interaction}}' or '{{Online Order " "Interaction}}' section, where you can find a record of all your " 'purchases.\n' '3. Locate the order associated with the number {{Order Number}} ' 'and click on it to view the details.\n' "4. Look for the option to '{{Online Order Interaction}}' and " 'select it to initiate the cancellation process.\n' '5. Follow any additional instructions or provide any necessary ' 'feedback as prompted by the system.\n' '\n' 'Should you encounter any difficulties or have further questions, ' 'our dedicated support team is available to assist you. You can ' 'reach us during {{Customer Support Hours}} at {{Customer Support ' 'Phone Number}}, or you can connect with us using the Live Chat ' 'feature on our website at {{Website URL}}. We are committed to ' 'providing you with a satisfactory resolution and ensuring your ' 'utmost satisfaction.'}
hold_out = customer_support[100:110]
hold_out = Dataset.from_dict(hold_out)
hold_out
Dataset({ features: ['flags', 'instruction', 'category', 'intent', 'response'], num_rows: 10 })
Data Preprocessing¶
def merge_example(row):
row['conversation'] = f"Query: {row['instruction']}\nResponse: {row['response']}"
return row
train_test = train_test.map(merge_example)
Map: 100%|██████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 3493.51 examples/s]
train_test
Dataset({ features: ['flags', 'instruction', 'category', 'intent', 'response', 'conversation'], num_rows: 100 })
pprint.pprint(train_test['conversation'][0])
('Query: question about cancelling order {{Order Number}}\n' "Response: I've understood you have a question regarding canceling order " "{{Order Number}}, and I'm here to provide you with the information you need. " "Please go ahead and ask your question, and I'll do my best to assist you.")
hold_out = hold_out.map(merge_example)
Map: 100%|█████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 712.51 examples/s]
Fine-tuning with Hugging Face¶
Fine-tuning a language model with Hugging Face involves adapting a pre-trained model to a specific task using additional training data. Here’s what you need to conduct fine-tuning:
Language Model + Tokenizer¶
We need a pre-trained model and its corresponding tokenizer.
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import LlamaForCausalLM, LlamaTokenizer, Trainer, TrainingArguments
model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0"
model_llamma = LlamaForCausalLM.from_pretrained(model_name)
tokenizer = LlamaTokenizer.from_pretrained(model_name)
# Ensure tokenizer uses the proper padding token (for LLaMA, this is often the EOS token)
tokenizer.pad_token = tokenizer.eos_token
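As a quick sanity check of the tokenizer on a sample conversation string, the sketch below tokenizes one example with padding and truncation; the example text and max_length=100 are illustrative assumptions (100 matches the max_seq_length used later for the trainer).
# Sanity check: tokenize a sample conversation with padding and truncation.
# The example string and max_length=100 are illustrative.
sample = "Query: can you help me cancel my order?\nResponse: Of course, I can help with that."
encoded = tokenizer(sample, padding="max_length", truncation=True,
                    max_length=100, return_tensors="pt")
print(encoded["input_ids"].shape)                       # torch.Size([1, 100])
print(tokenizer.decode(encoded["input_ids"][0][:15]))   # first few tokens decoded back to text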
Training Dataset¶
We require a task-specific dataset.
type(train_test)
datasets.arrow_dataset.Dataset
trainer_dataset = train_test.train_test_split(test_size=0.2)
trainer_dataset
DatasetDict({ train: Dataset({ features: ['flags', 'instruction', 'category', 'intent', 'response', 'conversation'], num_rows: 80 }) test: Dataset({ features: ['flags', 'instruction', 'category', 'intent', 'response', 'conversation'], num_rows: 20 }) })
trainer_dataset['train'][50]['conversation']
'Query: how to see the allowed payment modalities?\nResponse: Thanks for dropping us a line with your question about how to view our allowed payment methods. To access this information, you can visit our website and follow these simple steps:\n\n1. Go to the homepage of our website.\n2. Look for the "Payment Options" or "Accepted Payment Methods" section. It could be located in the footer, header, or under the "Help" or "Support" tab.\n3. Click on the link or button that says "View Payment Methods" or something similar.\n4. A new page will open displaying all the payment options we accept, along with any specific instructions or requirements for each method.\n\nIf you encounter any difficulties or need further guidance, please don\'t hesitate to let me know. I\'m here to help!'
Efficient fine-tuning with LoRA¶
LoRA helps fine-tune larger models with limited hardware.
What happens when we train a model?
Samples are input vectors: The data you train on is represented as a set of input values (features), which form vectors.
Models are matrices: The model is a collection of numbers (weights) organized into matrices, which determine how the inputs are transformed.
Matrix multiplication: The input vectors are multiplied by the model's matrices to produce output vectors.
Results in output vectors: The multiplication produces results, which are the model's predictions.
Errors are used to update model weights: The difference between the predictions and actual values (errors) is calculated, and the model adjusts its weights (parameters) to improve future predictions.
Model size determines training difficulty: Larger models have more parameters, making training more complex and resource-intensive.
LoRA (Low-Rank Adaptation) simplifies model fine-tuning by:
- Low-rank decomposition: Reduces the size of weight matrices.
- Reduces training parameters: Updates fewer weights during training.
- Maintains performance: Keeps the model's accuracy close to the original.
- Regularization effect: Helps prevent overfitting by limiting changes to the model.
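The low-rank decomposition idea can be made concrete with a small tensor example. The sketch below uses illustrative shapes (not the actual TinyLlama layer sizes) and follows the usual LoRA initialization, where B starts at zero so the adapted weight initially equals the original.
# Numeric illustration of LoRA's low-rank update (shapes are illustrative).
# Instead of updating the full d x d weight W, LoRA trains two thin matrices
# A (r x d) and B (d x r) and adds the scaled product B @ A to the frozen W.
import torch

d, r, alpha = 2048, 12, 32
W = torch.randn(d, d)                   # frozen pre-trained weight: d*d ~ 4.2M params
A = torch.randn(r, d) * 0.01            # trainable: r*d params
B = torch.zeros(d, r)                   # trainable: d*r params (zero init => B @ A = 0 at start)
W_adapted = W + (alpha / r) * (B @ A)   # effective weight used in the forward pass
print(f"full-update params: {d*d:,}  vs  LoRA params: {2*d*r:,}")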
How to implement LoRA using PEFT¶
from peft import LoraConfig
lora_config = LoraConfig(
r=12, # Rank of the low-rank adaptation matrix. This controls the dimensionality reduction for the LoRA module.
lora_alpha=32, # Scaling factor for the low-rank adaptation. This amplifies the adapted weights.
lora_dropout=0.05, # Dropout rate for LoRA layers. Helps in regularizing and preventing overfitting.
bias="none", # Specifies how to handle biases. In this case, no additional biases are introduced ("none").
task_type="CAUSAL_LM", # The type of task being performed. Here it's set for Causal Language Modeling (e.g., autoregressive text generation).
target_modules=['q_proj', 'v_proj'] # Specifies which layers in the model should have the LoRA applied, targeting the query and value projection layers.
)
Training Arguments¶
These define the fine-tuning process, including batch size, learning rate, epochs, and other hyperparameters.
from transformers import TrainingArguments
epochs = 4
# Training argument
training_args = TrainingArguments(
output_dir="./databricks/results", # Local directory to save check point of our model as fitting
num_train_epochs=epochs, # minimum of two epochs
per_device_train_batch_size=4, # batch size for training and evaluation, it common to take around 32,
per_device_eval_batch_size=4, # sometimes less or more, The smaller batch size, the more change model update
load_best_model_at_end=True, # Even if we overfit the model by accident, load the best model through checkpoint
# some deep learning parameters that the trainer is able to take in
warmup_steps = len(trainer_dataset['train']) // 10, # learning rate scheduler by number of warmup steps
weight_decay = 0.05, # weight decay for our learning rate schedule (regularization)
logging_steps = 1, # minimum number of steps between logging events (1 means logging at every step)
log_level = 'info',
evaluation_strategy = 'epoch', # either "steps" or "epoch"; we choose epoch to evaluate at the end of each epoch
eval_steps = 50,
save_strategy = 'epoch', # save a check point of our model after each epoch
learning_rate=1e-5, # A typical learning rate for fine-tuning
)
Fine-Tuning Process¶
The Trainer class is a versatile and flexible tool for fine-tuning various types of models (such as BERT, GPT, T5) across different tasks (like text classification, question answering, etc.), i.e., for general-purpose fine-tuning.
Here we use a specialized trainer, SFTTrainer from Hugging Face's TRL library, which handles the supervised fine-tuning process often employed with large models (e.g., GPT, LLaMA, etc.).
The SFT (Supervised Fine-Tuning) Trainer is a training framework commonly used to fine-tune large language models (LLMs) on specific tasks or datasets using supervised learning. In this context:
The SFT Trainer is typically built on top of deep learning frameworks like Hugging Face's Trainer
class, and it automates many common tasks in model fine-tuning, such as:
- Data loading and batching
- Handling the model’s optimizer and learning rate schedule
- Managing multiple training loops (epochs)
- Saving and logging the model’s progress
from trl import SFTConfig, SFTTrainer
# Initialize the SFT (Supervised Fine-Tuning) Trainer for training the model
trainer = SFTTrainer(
model=model_llamma, # The model to be fine-tuned
tokenizer=tokenizer, # The tokenizer used to process input data for the model
train_dataset=trainer_dataset['train'], # The dataset used for training
eval_dataset=trainer_dataset['test'],
# The name of the field in the dataset that contains the text data for training
dataset_text_field='conversation',
# The maximum sequence length (number of tokens) to be processed by the model in a single input
max_seq_length=100,
# Training arguments that define the training configuration (batch size, epochs, etc.)
args=training_args,
peft_config=lora_config,
)
Map: 100%|█████████████████████████████████████████████████████████████████████| 80/80 [00:00<00:00, 511.37 examples/s] Map: 100%|█████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 547.81 examples/s]
trainer.evaluate()
***** Running Evaluation ***** Num examples = 20 Batch size = 4
{'eval_loss': 2.2230427265167236, 'eval_model_preparation_time': 0.0119, 'eval_runtime': 24.2449, 'eval_samples_per_second': 0.825, 'eval_steps_per_second': 0.206}
trainer.train()
***** Running training ***** Num examples = 80 Num Epochs = 4 Instantaneous batch size per device = 4 Total train batch size (w. parallel, distributed & accumulation) = 4 Gradient Accumulation steps = 1 Total optimization steps = 80 Number of trainable parameters = 1,689,600
Epoch | Training Loss | Validation Loss | Model Preparation Time |
---|---|---|---|
1 | 2.288800 | 2.203518 | 0.011900 |
2 | 2.179600 | 2.179952 | 0.011900 |
3 | 2.116200 | 2.162249 | 0.011900 |
4 | 2.204500 | 2.155429 | 0.011900 |
***** Running Evaluation ***** Num examples = 20 Batch size = 4 (repeated after each epoch). Model checkpoints are saved to ./databricks/results\checkpoint-20, -40, -60, and -80, each with the LlamaConfig, tokenizer_config.json, and special_tokens_map.json. Training completed. Do not forget to share your model on huggingface.co/models =) Loading best model from ./databricks/results\checkpoint-80 (score: 2.1554293632507324).
TrainOutput(global_step=80, training_loss=2.202442817389965, metrics={'train_runtime': 938.0845, 'train_samples_per_second': 0.341, 'train_steps_per_second': 0.085, 'total_flos': 198950780928000.0, 'train_loss': 2.202442817389965, 'epoch': 4.0})
trainer.evaluate()
***** Running Evaluation ***** Num examples = 20 Batch size = 4
{'eval_loss': 2.1554293632507324, 'eval_model_preparation_time': 0.0119, 'eval_runtime': 23.6921, 'eval_samples_per_second': 0.844, 'eval_steps_per_second': 0.211, 'epoch': 4.0}
# We can save our model on drirectory we specified
trainer.save_model()
Saving model checkpoint to ./databricks/results loading configuration file config.json from cache at C:\Users\mrezv\.cache\huggingface\hub\models--TinyLlama--TinyLlama-1.1B-Chat-v1.0\snapshots\fe8a4ea1ffedaf415f4da2f062534de366a451e6\config.json Model config LlamaConfig { "architectures": [ "LlamaForCausalLM" ], "attention_bias": false, "attention_dropout": 0.0, "bos_token_id": 1, "eos_token_id": 2, "head_dim": 64, "hidden_act": "silu", "hidden_size": 2048, "initializer_range": 0.02, "intermediate_size": 5632, "max_position_embeddings": 2048, "mlp_bias": false, "model_type": "llama", "num_attention_heads": 32, "num_hidden_layers": 22, "num_key_value_heads": 4, "pretraining_tp": 1, "rms_norm_eps": 1e-05, "rope_scaling": null, "rope_theta": 10000.0, "tie_word_embeddings": false, "torch_dtype": "bfloat16", "transformers_version": "4.46.3", "use_cache": true, "vocab_size": 32000 } tokenizer config file saved in ./databricks/results\tokenizer_config.json Special tokens file saved in ./databricks/results\special_tokens_map.json
As can be seen, fine-tuning the Llama model requires a GPU; even with only 100 samples, training takes a long time on CPU.
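If the output_dir written by trainer.save_model() contains only the LoRA adapter (which is common when a peft_config is passed to the trainer), one way to reload it for inference is through the peft library. The sketch below assumes the base model name and output directory used above.
# Sketch: reload the LoRA adapter saved above on top of the base model and merge
# it for inference. Assumes the adapter was written to "./databricks/results".
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
tok = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
model_with_adapter = PeftModel.from_pretrained(base, "./databricks/results")
merged_model = model_with_adapter.merge_and_unload()   # fold the LoRA weights into the base model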
Load Saved Model from Google Colab¶
"T4 GPU" from Google Colab was applied to fine-tune "TinyLlama/TinyLlama-1.1B-Chat-v1.0". 10949 samples of Bitext Customer Support Data Set were used. We get almost 1000 samples from each category. See distribution below:
Now we load the fine-tuned model saved from Google Colab:
from transformers import AutoModelForCausalLM, AutoTokenizer
# Specify the path where you saved the model and tokenizer
model_path = "saved_model"
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Load the model
model_fine_tuned = AutoModelForCausalLM.from_pretrained(model_path)
loading file tokenizer.model loading file tokenizer.json loading file added_tokens.json loading file special_tokens_map.json loading file tokenizer_config.json loading configuration file config.json from cache at C:\Users\mrezv\.cache\huggingface\hub\models--TinyLlama--TinyLlama-1.1B-Chat-v1.0\snapshots\fe8a4ea1ffedaf415f4da2f062534de366a451e6\config.json Model config LlamaConfig { "_name_or_path": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "architectures": [ "LlamaForCausalLM" ], "attention_bias": false, "attention_dropout": 0.0, "bos_token_id": 1, "eos_token_id": 2, "head_dim": 64, "hidden_act": "silu", "hidden_size": 2048, "initializer_range": 0.02, "intermediate_size": 5632, "max_position_embeddings": 2048, "mlp_bias": false, "model_type": "llama", "num_attention_heads": 32, "num_hidden_layers": 22, "num_key_value_heads": 4, "pretraining_tp": 1, "rms_norm_eps": 1e-05, "rope_scaling": null, "rope_theta": 10000.0, "tie_word_embeddings": false, "torch_dtype": "bfloat16", "transformers_version": "4.46.3", "use_cache": true, "vocab_size": 32000 } loading weights file model.safetensors from cache at C:\Users\mrezv\.cache\huggingface\hub\models--TinyLlama--TinyLlama-1.1B-Chat-v1.0\snapshots\fe8a4ea1ffedaf415f4da2f062534de366a451e6\model.safetensors Generate config GenerationConfig { "bos_token_id": 1, "eos_token_id": 2 } All model checkpoint weights were used when initializing LlamaForCausalLM. All the weights of LlamaForCausalLM were initialized from the model checkpoint at TinyLlama/TinyLlama-1.1B-Chat-v1.0. If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training. loading configuration file generation_config.json from cache at C:\Users\mrezv\.cache\huggingface\hub\models--TinyLlama--TinyLlama-1.1B-Chat-v1.0\snapshots\fe8a4ea1ffedaf415f4da2f062534de366a451e6\generation_config.json Generate config GenerationConfig { "bos_token_id": 1, "eos_token_id": 2, "max_length": 2048, "pad_token_id": 0 }
Evaluation
After fine-tuning, we need an evaluation dataset to assess the model’s performance. This can help you measure improvements in accuracy, generalization, or task-specific metrics.
To evaluate a trained model, we use ROUGE (Recall-Oriented Understudy for Gisting Evaluation), which measures the word overlap between a reference text and the generated text:
- Reference Text: This is the ground truth or the correct output.
- Generated Text: This is the output from your trained model.
import evaluate
rouge = evaluate.load('rouge')
predictions = ["how are you?", "I am a abit under the weather"]
references = ["how are you doing", "I am sick"]
results = rouge.compute(predictions=predictions, references=references)
results
{'rouge1': np.float64(0.6285714285714286), 'rouge2': np.float64(0.525), 'rougeL': np.float64(0.6285714285714286), 'rougeLsum': np.float64(0.6285714285714286)}
Here are the definitions for the different ROUGE metrics:
ROUGE-1 (rouge1):
- Measures the overlap of unigrams (single words) between the reference and generated text.
- It captures the accuracy of individual words in the generated text compared to the reference.
- Formula: $ \text{ROUGE-1} = \frac{\text{Number of overlapping unigrams}}{\text{Total unigrams in reference}} $
ROUGE-2 (rouge2):
- Measures the overlap of bigrams (two consecutive words) between the reference and generated text.
- It provides insight into how well the sequence of words in the generated text matches the reference.
- Formula: $ \text{ROUGE-2} = \frac{\text{Number of overlapping bigrams}}{\text{Total bigrams in reference}} $
ROUGE-L (rougeL):
- Measures the longest common subsequence (LCS) between the reference and generated text.
- ROUGE-L focuses on capturing sentence-level structure and fluency by identifying the longest matching sequence of words.
- It rewards longer subsequences that appear in both texts in the same order.
- Formula: $ \text{ROUGE-L} = \frac{\text{Length of the longest common subsequence}}{\text{Total number of words in reference}} $
ROUGE-Lsum (rougeLsum):
- Specifically adapted for summarization tasks.
- It calculates ROUGE-L at the level of entire summaries rather than individual sentences. It compares the LCS between the entire reference and generated summaries.
- Useful for evaluating how well a summary preserves the overall meaning of the reference text.
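To make the ROUGE-1 recall formula concrete, here is a tiny manual computation; the example sentences are illustrative, and note that evaluate's rouge.compute reports F1-style scores by default, so its numbers will not match this raw recall exactly.
# Manual illustration of the ROUGE-1 recall formula above (illustrative strings).
reference = "the cat sat on the mat".split()
generated = "the cat lay on the mat".split()
overlap = sum(min(reference.count(w), generated.count(w)) for w in set(generated))
rouge1_recall = overlap / len(reference)
print(rouge1_recall)   # 5 overlapping unigrams / 6 reference unigrams ~ 0.83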
inst = hold_out['conversation'][0]
query, response = inst.split('\nResponse: ')
query
'Query: i try to unsubscribe to ur companu newsletter'
response
"I can sense that you're trying to unsubscribe from our company newsletter. I apologize for any inconvenience this may have caused. To assist you further, please provide me with the email address you used to subscribe to the newsletter. Once I have the necessary details, I will promptly process your request and ensure that you no longer receive our newsletter. Your satisfaction is our top priority, and we appreciate your patience as we resolve this matter for you."
# Encode the instruction with the tokenizer
inputs = tokenizer.encode(query, return_tensors="pt", padding=True, truncation=True)
# Generate output from the model with a max length of 200
outputs = model_fine_tuned.generate(
input_ids=inputs,
max_length=200, # Maximum length of the output
#num_return_sequences=1, # Number of sequences to return
temperature=0, # Sampling temperature to control randomness
)
# Decode the generated output, skipping special tokens
decoded_outputs = tokenizer.decode(outputs[0, inputs.shape[1]:], skip_special_tokens=True)
decoded_outputs.replace("\nResponse: ","")
"I'm aware that you're trying to unsubscribe from our company's newsletter. I apologize for any inconvenience this may have caused. To ensure that you receive our newsletters in the future, please follow these steps:\n\n1. Go to our newsletter signup page: {{Newsletter Signup Page URL}}\n2. Look for the unsubscribe option: {{Unsubscribe Option Name}}\n3. Click on the unsubscribe button: {{Unsubscribe Button Text}}\n4. Follow the prompts to confirm your unsubscribe: {{Confirmation Step Text}}\n\nIf you encounter any difficulties or have further questions, please don't hesitate to reach out. We value your feedback and are committed to providing you with the best possible experience. Thank you for your patience and understanding.\n\nRest assured, your unsubscribe request is important to"
def generate_pred_ref(model, dataset, tokenizer):
predictions = []
references = []
for row in dataset:
# Encode the instruction with the tokenizer
query, response = row.split('\nResponse: ')
query = query.replace('Query: ', '')
inputs = tokenizer.encode(query, return_tensors="pt", padding=True, truncation=True)
try:
# Generate output from the model with a max length of 200
outputs = model.generate(inputs, max_length=200, temperature=0)
# Decode the generated output, skipping special tokens
decoded_outputs = tokenizer.decode(outputs[0, inputs.shape[1]:], skip_special_tokens=True)
decoded_outputs = decoded_outputs.replace(".\nResponse: ","")
# Append the reference and prediction to respective lists
references.append(response)
predictions.append(decoded_outputs)
except ValueError:
pass
return references, predictions
How to run ROUGE on the evaluation set:
ref, pred = generate_pred_ref(model_fine_tuned, hold_out['conversation'], tokenizer)
rouge = evaluate.load('rouge')
results = rouge.compute(predictions=pred, references=ref)
results
{'rouge1': np.float64(0.33143286322497734), 'rouge2': np.float64(0.10021850306517736), 'rougeL': np.float64(0.2065618991099485), 'rougeLsum': np.float64(0.20190403982743405)}
ref[:1]
["I can sense that you're trying to unsubscribe from our company newsletter. I apologize for any inconvenience this may have caused. To assist you further, please provide me with the email address you used to subscribe to the newsletter. Once I have the necessary details, I will promptly process your request and ensure that you no longer receive our newsletter. Your satisfaction is our top priority, and we appreciate your patience as we resolve this matter for you."]
pred[:1]
[".\nI'm not sure if I should unsubscribe from your newsletter. Can you please provide more information about the benefits of subscribing?\nI'm not sure if I should unsubscribe from your newsletter. Can you please provide more information about the benefits of subscribing? I appreciate your help.\nI'm not sure if I should unsubscribe from your newsletter. Can you please provide more information about the benefits of subscribing? I appreciate your help.\nI'm not sure if I should unsubscribe from your newsletter. Can you please provide more information about the benefits of subscribing? I appreciate your help.\nI'm not sure if I should unsubscribe from your newsletter. Can you please provide more information about the benefits of subscribing? I appreciate your help.\nI'm not sure if I should unsubscribe from your newsletter. Can you please provide more information"]
Question-Answering with LangChain¶
LangChain is an open-source framework aimed at streamlining the deployment of large language models (LLMs) in production. It provides a model-agnostic toolkit that lets developers explore and use different LLMs through a single unified interface, which simplifies integration with various providers and minimizes code changes when switching between them. Here, we wrap our fine-tuned Hugging Face model in a LangChain pipeline and use it to answer customer-support questions.
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
# Imports for building a simple chain (below we compose it with the `prompt | llm` syntax rather than LLMChain)
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# Create a text-generation pipeline from the fine-tuned model
pipe = pipeline("text-generation",
                model=model_fine_tuned,
                tokenizer=tokenizer,
                # Adjust generation parameters here
                temperature=0.0,  # Adjust the temperature
                top_k=50,         # Set top_k for controlling sampling diversity
                top_p=0.9,        # Use nucleus sampling with top_p
                max_length=200,   # Max number of tokens to generate
                )

# Wrap it for LangChain
llm_chat = HuggingFacePipeline(pipeline=pipe)
C:\Users\mrezv\AppData\Local\Temp\ipykernel_28784\229219048.py:13: LangChainDeprecationWarning: The class `HuggingFacePipeline` was deprecated in LangChain 0.0.37 and will be removed in 1.0. An updated version of the class exists in the :class:`~langchain-huggingface package and should be used instead. To use it run `pip install -U :class:`~langchain-huggingface` and import as `from :class:`~langchain_huggingface import HuggingFacePipeline``. llm_chat = HuggingFacePipeline(pipeline=pipe)
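The deprecation warning above points to the newer langchain-huggingface package. If that package is installed (pip install -U langchain-huggingface), the same wrapper can be created with the updated import; a minimal sketch:
# Updated import suggested by the deprecation warning (assumes `langchain-huggingface` is installed)
from langchain_huggingface import HuggingFacePipeline

llm_chat = HuggingFacePipeline(pipeline=pipe)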
# Define a prompt template
prompt = PromptTemplate(
    input_variables=["question"],
    template="Question: {question}\nAnswer:"
)
# Create the chain
simple_chain = prompt | llm_chat
# Run the chain
response = simple_chain.invoke({"question": "i try to unsubscribe to ur companu newsletter"})
print(response)
Disabling tokenizer parallelism, we're using DataLoader multithreading already Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Question: i try to unsubscribe to ur companu newsletter Answer: I'm sorry to hear that you're having trouble unsubscribing from our newsletter. Let me assist you with unsubscribing. To unsubscribe from our newsletter, please follow these steps: 1. Go to our website and navigate to the "Newsletter" or "Subscribe" section. 2. Look for the "Unsubscribe" or "Unsubscribe from Newsletter" option and click on it. 3. You'll be prompted to enter your email address or provide your phone number. 4. Follow the prompts to confirm your subscription and confirm that you no longer wish to receive newsletters from us. If you encounter any difficulties or have further questions, please don't hesitate to let me know. I'm here to help you every step of the way. Thank you for choosing our newsletter and I appreciate
answer = response.split('Answer: ')[1]
answer
'I\'m sorry to hear that you\'re having trouble unsubscribing from our newsletter. Let me assist you with unsubscribing. To unsubscribe from our newsletter, please follow these steps:\n\n1. Go to our website and navigate to the "Newsletter" or "Subscribe" section.\n2. Look for the "Unsubscribe" or "Unsubscribe from Newsletter" option and click on it.\n3. You\'ll be prompted to enter your email address or provide your phone number.\n4. Follow the prompts to confirm your subscription and confirm that you no longer wish to receive newsletters from us.\n\nIf you encounter any difficulties or have further questions, please don\'t hesitate to let me know. I\'m here to help you every step of the way. Thank you for choosing our newsletter and I appreciate'
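Rather than splitting the raw generation by hand after each call, the extraction step can be appended to the chain itself. The sketch below is one way to do this, assuming langchain_core is available (it ships with recent LangChain releases); extract_answer is a hypothetical helper added here for illustration, not part of the original notebook:
from langchain_core.runnables import RunnableLambda

# Hypothetical post-processing step: keep only the text after the last "Answer:" marker
extract_answer = RunnableLambda(lambda text: text.split('Answer:')[-1].strip())

qa_chain = prompt | llm_chat | extract_answer
qa_chain.invoke({"question": "i try to unsubscribe to ur companu newsletter"})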
# Define a prompt template (identical to the one above)
prompt = PromptTemplate(
    input_variables=["question"],
    template="Question: {question}\nAnswer:"
)
# Note: formatting the question here and then invoking the chain applies the template twice,
# which is why the output below starts with "Question: Question: ... Answer: Answer:"
formatted_query = prompt.format(question="i try to unsubscribe to ur companu newsletter")
# Create the chain
simple_chain = prompt | llm_chat
# Run the chain on the pre-formatted query
response = simple_chain.invoke({"question": formatted_query})
print(response)
Question: Question: i try to unsubscribe to ur companu newsletter Answer: Answer: I'm sorry to hear that you're having trouble unsubscribing from our newsletter. It's important to me that you have a positive experience with our newsletter. To unsubscribe, please follow these steps: 1. Go to our newsletter signup page: https://www.example.com/newsletter 2. Look for the unsubscribe option: You should see a button or link labeled "Unsubscribe" or "Unsubscribe from Newsletter" 3. Click on the "Unsubscribe" button: This will take you to a page where you can enter your email address and confirm your request to unsubscribe. 4. Follow the prompts: Once you've entered your email address and confirmed your request, you should be able to successfully unsubscribe from our newsletter. If you encounter any difficulties or have
response.split('\nAnswer: ')
['Question: Question: i try to unsubscribe to ur companu newsletter\nAnswer:', 'I\'m sorry to hear that you\'re having trouble unsubscribing from our newsletter. It\'s important to me that you have a positive experience with our newsletter. To unsubscribe, please follow these steps:\n\n1. Go to our newsletter signup page: https://www.example.com/newsletter\n2. Look for the unsubscribe option: You should see a button or link labeled "Unsubscribe" or "Unsubscribe from Newsletter"\n3. Click on the "Unsubscribe" button: This will take you to a page where you can enter your email address and confirm your request to unsubscribe.\n4. Follow the prompts: Once you\'ve entered your email address and confirmed your request, you should be able to successfully unsubscribe from our newsletter.\n\nIf you encounter any difficulties or have']
# Run the chain
response = simple_chain.invoke({"question": "I'm trying to see the rebate current status"})
print(response)
Question: I'm trying to see the rebate current status Answer: To view the current status of your rebate, you can log in to your account on our website and navigate to the "My Account" or "Profile" section. From there, you should be able to find the information you need to track your rebate status. If you encounter any difficulties or have further questions, please don't hesitate to reach out to our customer support team for assistance. We're here to help you every step of the way!
# Run the chain
response = simple_chain.invoke({"question": "can i open a {{Account Type}} account for my dad"})
print(response)
Question: can i open a {{Account Type}} account for my dad Answer: Yes, you can open a {{Account Type}} account for your dad. The process is simple and straightforward. You can visit our website and navigate to the "Accounts" or "Account Settings" section. From there, you can choose the "Add Account" option and follow the prompts to create a new account for your dad. Once you have created the account, you can provide your dad with the necessary information, such as his name, address, and any other relevant details. With this account, your dad can access all the features and benefits of our platform, including {{Account Type}} accounts, and enjoy the convenience of having a personalized account for him.