In [None]:
import numpy as np
import random
from nltk.corpus import brown
import math
from collections import Counter, defaultdict

In [None]:
# Ensure you have the Brown corpus downloaded:
import nltk
nltk.download('brown')

# Retrieve the sentences from the Brown corpus
# (each sentence is a sequence of word tokens).
brown_sentences = brown.sents()

# Split the data into training, validation, and test sets.
# The split is positional (no shuffling): the first 40138 sentences train,
# the next 5734 (40138..45871) validate, and everything after that tests.
# NOTE(review): the hard-coded boundaries look like a 70% / 10% / 20% split —
# confirm against len(brown_sentences).
D_train = brown_sentences[:40138]
D_val = brown_sentences[40138:45872]
D_test = brown_sentences[45872:]

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


In [None]:
# Function to build a unigram model
def build_unigram_model(sentences):
    """Estimate relative word frequencies from tokenized sentences.

    Args:
        sentences: iterable of sentences, each an iterable of word tokens.

    Returns:
        A Counter mapping each observed token to its share of all tokens
        (the values sum to 1 when at least one token was seen). An empty
        input yields an empty Counter.
    """
    frequencies = Counter(token for tokens in sentences for token in tokens)
    n_tokens = sum(frequencies.values())
    # Convert raw counts to relative frequencies in place.
    for token in frequencies:
        frequencies[token] /= n_tokens
    return frequencies

# Function to build a bigram model
def build_bigram_model(sentences):
    """Estimate P(next word | previous word) from tokenized sentences.

    Only adjacent pairs inside a sentence are counted; no sentence-boundary
    markers are added, so the first word of a sentence has no context.

    Args:
        sentences: iterable of sentences, each an iterable of word tokens.

    Returns:
        A defaultdict(Counter) mapping previous_word -> Counter of
        next_word -> conditional relative frequency.
    """
    transitions = defaultdict(Counter)
    for sentence in sentences:
        tokens = list(sentence)
        for left, right in zip(tokens, tokens[1:]):
            # A None token is never used as a context.
            if left is not None:
                transitions[left][right] += 1
    # Normalise every context's follower counts into a distribution.
    for followers in transitions.values():
        context_total = sum(followers.values())
        for token in followers:
            followers[token] /= context_total
    return transitions

# Build unigram and bigram models
# Both are fitted on the training split only; the values they hold are
# relative frequencies (already normalised by the builder functions).
unigram_model = build_unigram_model(D_train)
bigram_model = build_bigram_model(D_train)

In [None]:
# Function to calculate perplexity
def calculate_perplexity(model, sentences, n_gram=1):
    """Compute the perplexity of a unigram or bigram model on `sentences`.

    The computation is done in log space: multiplying 1/p over a whole corpus
    overflows a float to `inf`, whereas summing log-probabilities stays finite.

    Args:
        model: for n_gram == 1, a mapping word -> probability; otherwise a
            mapping previous_word -> (mapping word -> probability).
        sentences: iterable of tokenized sentences (sequences of words).
        n_gram: 1 scores every word on its own; any other value scores each
            adjacent (previous_word, word) pair inside a sentence.

    Returns:
        exp(-(1/N) * sum(log p)) over the N scored events, as a float.
        Returns `inf` if the model assigns a non-positive probability.

    Raises:
        ValueError: if there is nothing to score (N == 0).
    """
    unseen_probability = 1e-6  # fallback for events the model has never seen
    log_probability_sum = 0.0
    N = 0
    for sentence in sentences:
        if n_gram == 1:
            probabilities = (model.get(word, unseen_probability) for word in sentence)
        else:
            probabilities = (
                _bigram_probability(model, previous_word, word, unseen_probability)
                for previous_word, word in zip(sentence, sentence[1:])
            )
        for word_probability in probabilities:
            if word_probability <= 0:
                return float('inf')
            log_probability_sum += math.log(word_probability)
            N += 1
    if N == 0:
        raise ValueError("calculate_perplexity: no n-grams to score")
    return math.exp(-log_probability_sum / N)


def _bigram_probability(model, previous_word, word, default):
    """Look up P(word | previous_word) without inserting keys into `model`."""
    followers = model.get(previous_word)
    if not followers:
        return default
    return followers.get(word, default)

# Calculate perplexity
# Unsmoothed models fitted on D_train are scored on both the training split
# and the held-out test split, for unigrams (default n_gram) and bigrams.
unigram_perplexity_train = calculate_perplexity(unigram_model, D_train)
bigram_perplexity_train = calculate_perplexity(bigram_model, D_train, n_gram=2)
unigram_perplexity_test = calculate_perplexity(unigram_model, D_test)
bigram_perplexity_test = calculate_perplexity(bigram_model, D_test, n_gram=2)

# Print the perplexities
print(f"Unigram Perplexity on Training Data: {unigram_perplexity_train}")
print(f"Bigram Perplexity on Training Data: {bigram_perplexity_train}")
print(f"Unigram Perplexity on Test Data: {unigram_perplexity_test}")
print(f"Bigram Perplexity on Test Data: {bigram_perplexity_test}")

Unigram Perplexity on Training Data: inf
Bigram Perplexity on Training Data: inf
Unigram Perplexity on Test Data: inf
Bigram Perplexity on Test Data: inf


In [None]:
# Implement add-λ smoothing method for unigram model
def add_lambda_smoothing_unigram(unigram_model, lambda_, vocabulary_size):
    """Apply add-λ smoothing to every entry of a unigram model.

    Each value v becomes (v + λ) / (sum of all values + λ * vocabulary_size).

    Args:
        unigram_model: mapping word -> count (or weight).
        lambda_: additive smoothing constant.
        vocabulary_size: number of word types used in the denominator.

    Returns:
        A plain dict with the same keys as `unigram_model`; words absent from
        the input get no entry.
    """
    denominator = sum(unigram_model.values()) + lambda_ * vocabulary_size
    return {
        token: (value + lambda_) / denominator
        for token, value in unigram_model.items()
    }

# Implement add-λ smoothing method for bigram model
def add_lambda_smoothing_bigram(bigram_model, lambda_, vocabulary_size):
    """Apply add-λ smoothing to each context row of a bigram model.

    Within a context, each value v becomes
    (v + λ) / (sum of that context's values + λ * vocabulary_size).

    Args:
        bigram_model: mapping previous_word -> (mapping word -> count/weight).
        lambda_: additive smoothing constant.
        vocabulary_size: number of word types used in the denominator.

    Returns:
        A defaultdict of defaultdict(float) holding the smoothed values for
        the (context, word) pairs present in the input. Contexts with no
        followers are not copied over.
    """
    smoothed = defaultdict(lambda: defaultdict(float))
    for context, followers in bigram_model.items():
        if not followers:
            continue
        denominator = sum(followers.values()) + lambda_ * vocabulary_size
        smoothed[context].update(
            (token, (value + lambda_) / denominator)
            for token, value in followers.items()
        )
    return smoothed


Problem 1 part C

In [None]:

# Perform a grid search over a range of λ values for unigram model
# 20 evenly spaced candidates from 0.1 to 2.0 (step 0.1); the same grid is
# reused for the bigram search below.
lambda_values = np.linspace(0.1, 2.0, 20)  # Example range, adjust as needed
best_lambda_unigram = lambda_values[0]
best_perplexity_unigram = float('inf')

for lambda_ in lambda_values:
    # Vocabulary size is the number of distinct words in the training model.
    smoothed_model = add_lambda_smoothing_unigram(unigram_model, lambda_, len(unigram_model))
    perplexity = calculate_perplexity(smoothed_model, D_val)
    print(f"λ = {lambda_}, Unigram Perplexity = {perplexity}")

    # Strict '<' keeps the earliest λ on ties; if every perplexity is inf the
    # comparison never succeeds and the initial lambda_values[0] is reported.
    if perplexity < best_perplexity_unigram:
        best_perplexity_unigram = perplexity
        best_lambda_unigram = lambda_

print(f"Best λ for Unigram: {best_lambda_unigram}, with perplexity: {best_perplexity_unigram}")

# Perform a grid search over a range of λ values for bigram model
best_lambda_bigram = lambda_values[0]
best_perplexity_bigram = float('inf')

for lambda_ in lambda_values:
    # Same vocabulary size as the unigram search (distinct training words).
    smoothed_model = add_lambda_smoothing_bigram(bigram_model, lambda_, len(unigram_model))
    perplexity = calculate_perplexity(smoothed_model, D_val, n_gram=2)
    print(f"λ = {lambda_}, Bigram Perplexity = {perplexity}")

    # Same selection rule as above: lowest validation perplexity wins.
    if perplexity < best_perplexity_bigram:
        best_perplexity_bigram = perplexity
        best_lambda_bigram = lambda_

print(f"Best λ for Bigram: {best_lambda_bigram}, with perplexity: {best_perplexity_bigram}")

  perplexity = perplexity * (1 / word_probability)


λ = 0.1, Unigram Perplexity = inf
λ = 0.2, Unigram Perplexity = inf
λ = 0.3, Unigram Perplexity = inf
λ = 0.4, Unigram Perplexity = inf
λ = 0.5, Unigram Perplexity = inf
λ = 0.6, Unigram Perplexity = inf
λ = 0.7, Unigram Perplexity = inf
λ = 0.7999999999999999, Unigram Perplexity = inf
λ = 0.8999999999999999, Unigram Perplexity = inf
λ = 0.9999999999999999, Unigram Perplexity = inf
λ = 1.0999999999999999, Unigram Perplexity = inf
λ = 1.2, Unigram Perplexity = inf
λ = 1.3, Unigram Perplexity = inf
λ = 1.4, Unigram Perplexity = inf
λ = 1.5, Unigram Perplexity = inf
λ = 1.5999999999999999, Unigram Perplexity = inf
λ = 1.7, Unigram Perplexity = inf
λ = 1.8, Unigram Perplexity = inf
λ = 1.9, Unigram Perplexity = inf
λ = 2.0, Unigram Perplexity = inf
Best λ for Unigram: 0.1, with perplexity: inf
λ = 0.1, Bigram Perplexity = inf
λ = 0.2, Bigram Perplexity = inf
λ = 0.3, Bigram Perplexity = inf
λ = 0.4, Bigram Perplexity = inf
λ = 0.5, Bigram Perplexity = inf
λ = 0.6, Bigram Perplexity = inf
λ

In [None]:
# Combine training and validation data for retraining
D_train_val = D_train + D_val

# Fit the raw models on the combined data once, and take the vocabulary size
# from that same data. Using the training-only vocabulary here would make the
# smoothing denominator too small for the larger train+val vocabulary, so the
# smoothed unigram probabilities would no longer sum to 1.
unigram_model_train_val = build_unigram_model(D_train_val)
bigram_model_train_val = build_bigram_model(D_train_val)
vocabulary_size_train_val = len(unigram_model_train_val)

# Retrain unigram and bigram models with the combined data using best λ values
unigram_model_retrained = add_lambda_smoothing_unigram(unigram_model_train_val, best_lambda_unigram, vocabulary_size_train_val)
bigram_model_retrained = add_lambda_smoothing_bigram(bigram_model_train_val, best_lambda_bigram, vocabulary_size_train_val)

# Calculate perplexity of the retrained models on the test data
unigram_perplexity_test_retrained = calculate_perplexity(unigram_model_retrained, D_test)
bigram_perplexity_test_retrained = calculate_perplexity(bigram_model_retrained, D_test, n_gram=2)

print(f"Retrained Unigram Perplexity on Test Data: {unigram_perplexity_test_retrained}")
print(f"Retrained Bigram Perplexity on Test Data: {bigram_perplexity_test_retrained}")

  perplexity = perplexity * (1 / word_probability)


Retrained Unigram Perplexity on Test Data: inf
Retrained Bigram Perplexity on Test Data: inf


In [None]:
import random

def _sample_weighted(distribution):
    """Draw one key from `distribution` with probability proportional to its value.

    Falls back to a uniform draw when no key carries positive weight, so a
    degenerate (all-zero) row still yields a word instead of raising.
    """
    candidates = list(distribution.keys())
    weights = [max(float(distribution[candidate]), 0.0) for candidate in candidates]
    if sum(weights) > 0:
        return random.choices(candidates, weights=weights)[0]
    return random.choice(candidates)


def generate_sentence_from_model(model, stop_symbol, is_bigram=False):
    """Sample a sentence from a unigram or bigram language model.

    Words are drawn according to the probabilities stored in the model
    rather than uniformly over its keys.

    Args:
        model: unigram mapping word -> probability, or (when `is_bigram`)
            mapping previous_word -> (mapping word -> probability).
        stop_symbol: token that ends generation; it is not included in the
            returned text.
        is_bigram: select the bigram sampling procedure.

    Returns:
        The generated words joined by single spaces. At most 50 words are
        sampled (plus the seed word for bigrams). An empty model yields ''.
    """
    sentence = []
    max_iterations = 50  # Limit to prevent infinite loop

    if not model:
        return ''

    if is_bigram:
        # The bigram model has no start-of-sentence context, so the seed word
        # is picked uniformly from the known contexts.
        word = random.choice(list(model.keys()))
        sentence.append(word)

        for _ in range(max_iterations):
            # `in` check first so a defaultdict model is not grown by lookups.
            if word in model and model[word]:
                next_word = _sample_weighted(model[word])
                if next_word == stop_symbol:
                    break
                sentence.append(next_word)
                word = next_word  # Update the current word for the next iteration
            else:
                break  # Break if the current word has no following words in the model
    else:
        # For unigrams every word is an independent draw from the model.
        for _ in range(max_iterations):
            word = _sample_weighted(model)
            if word == stop_symbol:
                break
            sentence.append(word)

    return ' '.join(sentence)

# Assuming unigram_model_retrained and bigram_model_retrained are defined elsewhere
# (they are produced by the retraining cell; run that cell first).
# NOTE(review): build_unigram_model / build_bigram_model add no sentence-boundary
# markers, so '</s>' presumably never occurs in these models and generation
# only stops at the iteration cap — confirm.

# Generate sentences from the retrained unigram and bigram models
print("Generated sentences from the unigram model:")
for _ in range(5):
    print(generate_sentence_from_model(unigram_model_retrained, '</s>'))

print("\nGenerated sentences from the bigram model:")
for _ in range(5):
    print(generate_sentence_from_model(bigram_model_retrained, '</s>', is_bigram=True))


Generated sentences from the unigram model:


Problem 1 Part E

In [None]:
from collections import defaultdict, Counter
import random
import math

def build_trigram_model(sentences):
    """Estimate P(w3 | w1, w2) from tokenized sentences.

    Every sentence is padded with two '<s>' markers in front and one '</s>'
    at the end before its trigrams are counted.

    Args:
        sentences: iterable of sentences, each a list of word tokens.

    Returns:
        A three-level nested defaultdict, model[w1][w2][w3], holding the
        relative frequency of w3 among all continuations of (w1, w2).
    """
    model = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    for tokens in sentences:
        padded = ['<s>', '<s>'] + tokens + ['</s>']
        for first, second, third in zip(padded, padded[1:], padded[2:]):
            model[first][second][third] += 1
    # Turn each (w1, w2) row of counts into a probability distribution.
    for by_second in model.values():
        for followers in by_second.values():
            row_total = float(sum(followers.values()))
            for token in followers:
                followers[token] /= row_total
    return model

def calculate_perplexity(model, sentences, n_gram=3):
    """Compute the perplexity of a trigram model on `sentences`.

    Note: this definition has the same name as the earlier unigram/bigram
    perplexity function and replaces it once this cell has run.

    Each sentence is padded with two '<s>' markers and a closing '</s>', and
    every resulting trigram is scored. Log-probabilities are summed instead
    of multiplying 1/p, because that product overflows to `inf` on a corpus.

    Args:
        model: nested mapping model[w1][w2][w3] -> P(w3 | w1, w2).
        sentences: iterable of tokenized sentences.
        n_gram: must be 3; kept for call compatibility.

    Returns:
        exp(-(1/N) * sum(log p)) over the N scored trigrams. Trigrams missing
        from the model are given probability 1e-6. Returns `inf` if the model
        stores a non-positive probability.

    Raises:
        ValueError: if `n_gram` is not 3, or if there are no trigrams to score.
    """
    if n_gram != 3:
        raise ValueError(f"calculate_perplexity: only n_gram=3 is supported here, got {n_gram}")

    unseen_probability = 1e-6
    log_probability_sum = 0.0
    N = 0
    for sentence in sentences:
        padded = ['<s>', '<s>'] + list(sentence) + ['</s>']
        for first, second, third in zip(padded, padded[1:], padded[2:]):
            # .get() at every level so a defaultdict model is not grown by lookups.
            word_probability = model.get(first, {}).get(second, {}).get(third, unseen_probability)
            if word_probability <= 0:
                return float('inf')
            log_probability_sum += math.log(word_probability)
            N += 1
    if N == 0:
        raise ValueError("calculate_perplexity: no trigrams to score")
    return math.exp(-log_probability_sum / N)

def generate_sentence_from_trigram_model(model):
    """Sample a sentence from a trigram model, starting from ('<s>', '<s>').

    At each step the next word is drawn from model[w1][w2] with probability
    proportional to the stored values. Generation ends when '</s>' is drawn
    or when the current word pair has no entry in the model.

    Args:
        model: nested mapping model[w1][w2][w3] -> weight/probability.

    Returns:
        The sampled words (without boundary markers) joined by spaces.
    """
    generated = []
    older, newer = '<s>', '<s>'
    while newer in model[older]:
        followers = model[older][newer]
        drawn = random.choices(list(followers.keys()), weights=followers.values())[0]
        if drawn == '</s>':
            break
        generated.append(drawn)
        # Slide the two-word context window forward.
        older, newer = newer, drawn
    return ' '.join(generated)

# Ensure you define and preprocess D_train and D_test before using them
# The trigram model is fitted on the training split and scored on the test
# split with the calculate_perplexity defined in this cell (the trigram
# version, which shadows the earlier unigram/bigram function of the same name).
trigram_model = build_trigram_model(D_train)
trigram_perplexity_test = calculate_perplexity(trigram_model, D_test, n_gram=3)
print(f"Trigram Perplexity on Test Data: {trigram_perplexity_test}")

# Generate sentences
# Sampling is random and unseeded, so the five sentences differ between runs.
for _ in range(5):
    print(generate_sentence_from_trigram_model(trigram_model))

Trigram Perplexity on Test Data: inf
Varani has been obvious to Margenau ) Ptolemaic astronomy would be available to advance against a white gold band with a bit longer pause between sets .
Cereal grains furnish about 40 times greater .
Fig. 8 .
Mr. Lyford continues : `` For God's sake , I believe felt a topic .
He has not been for Hitler's war .


In [None]:
## 5 Best input-output pairs
# input: What is the biest building - output "the tallest building in the world is the Burj Khalifa in Dubai, United Arab Emirartes, stanidng at a hgith of 828 meters"
# input: 'cold weather' output: 'cold weather can be defined as temperatures below the comfort zones of humans beings.
# input: 'red chapstick' output: 'typically refers to a type of lip balm or lip moisturizer that comes in a red-colored tube or container. Chapstick is a brand name that has become synonymous with lip balm, much like Kleenex is often used to refer to facial tissues.The red color is often associated with various flavors or scents, such as cherry or strawberry.'
# input: 'space nuisance' Output: 'space nuisance encompasses a range of challenges and issues that arise from humanity's increasing presence and activity in space'
# input: 'super cala' Output: It seems like you're referring to the phrase "supercalifragilisticexpialidocious," which gained popularity through the Disney movie "Mary Poppins."

## 5 Worst input-output pairs
# input: kill mocking" output: earth ervolves around the sun"
# input: 'rabbit buggy' output" 'amusment park ride'
# input: 'tres tristes tigeres' Output: 'literal interpretation, three sad tigers'
# input: 'midcentury modern' Output: chilly
# input: 'direct quote from famous book' Output: It could not provide


In [None]:
#Query 1: What are the requirments for a 125 medical license in Illinois?
  #ChatGPT
    #Pros: It gives an okay general overview
    #Cons: The information was not complete, and someone who is not careful will take it as accurate.
  #Google
    #Pros: It provided the direct source from the Illinois Department of Professional Regulation as the first option and provided local hospitals' sites with specific deadlines and other pertinent information for their institutions.
    #Cons: There is no con here, but it would have been nice to have a preview of the required information in a right pane.


#Query2: What are the best steps to become a machine learning engineer?
  #ChatGPT
    #Pros: It provided a little paragraph and then listed 7 steps with bullet points on what to do. This is great as it gets the information from the web and condenses the most common things one needs to do.
    #Cons: It would be great if it provided links and sources.
  #Google
    #Pros: It provided 2,050,000,000 results for sites with information from reputable universities, certificates, and a short list of 6 things to do.
    #Cons: There is no way someone will be able to look at all 2,050,000,000 results, the top results were sponsored options, and a lot of opinions from facts.


#Query3: What are the best steps for parents to save for their children's college education?
  #ChatGPT
    #Pros: It provided 10 options, and were bite sized bits of information which was easy to understand.
    #Cons: It was just general information and it didn't provide a reputable source for parents to reach out for more detailed information.
  #Google
    #Pros: It provided 203,000,000 results which contained different opinions to consider.
    #Cons: It was overwhelming to discern which information was from a reputable source, especially since there are unscrupulous people/businesses out there and knowing a few reputable companies would be beneficial.

In [None]:
#Q1 Accounting Question: INPUT: For accounting purposes, how do I record a twenty-year lease for a building where I will operate a store. The least payments are $10,000 per month. The building Is currently worth $1.8 million USD. The current discount rate is 6.9%
  #Using ChatGPT 3.5: OUTPUT: ChatGPT provided a correct step-by-step technical explanation including calculating the net present value of lease payments, provided the income statement and balance sheet considerations, provided the journal entries and gave the disclosure requirements.
  #Impression: ChatGPT was impressive: it not only calculated the net present value where required, it also provided the journal entries and step-by-step instructions. This would be very helpful to an entry-level accountant. It was complete.

#Q2 Medical Graduate Education Question: INPUT: What are the requirments for a 125 medical license in Illinois?
  # Using ChatGPT 3.5: OUTPUT: It gave basic general requirements: Medical Education, Examination, English Language Proficiency, Postgraduate Training Program, Application and Documentation, Fees, and Background Check
  # Impression: While it gave a good generalization, it was missing a few required items, which matters for something that is required to start or continue training in Illinois. This could use more work, or it could provide a PDF or link to the exact requirements.

#Q3 Accounting Question: INPUT: How do I account for the purchase of a new vehicle for a business?
  #Using ChatGPT 3.5: OUTPUT: ChatGPT provided a correct step-by-step technical guide including instructions for entering debits and credits in the general ledger to record the purchase, recording depreciation, and financial statement presentation.
  #Impression: The ChatGPT output was extremely valuable to anybody with basic accounting skills. It would serve as a refresher or even as training for a new accountant. The material was complete.

#Q4: Were all GME fellowships accredited by the ACGME in 2020?
  #Using ChatGPT 3.5: OUTPUT: In 2020, not all Graduate Medical Education (GME) fellowships were necessarily accredited by the Accreditation Council for Graduate Medical Education (ACGME). While many fellowship programs in the United States seek ACGME accreditation to ensure quality standards and eligibility for funding, there are still some non-ACGME accredited fellowship programs in existence.
  #Impression: This is accurate

#Q5: How often must hospitals in Illinois check thier controlled substance inventories?
  #Using ChatGPT 3.5: OUTPUT: In Illinois, hospitals are required to maintain accurate records of controlled substances and conduct inventories at specified intervals. According to the Illinois Controlled Substances Act (720 ILCS 570), hospitals must conduct inventories of all Schedule II controlled substances at least every two years. Additionally, hospitals must conduct inventories of all Schedule III, IV, and V controlled substances at least every two years, but they may elect to conduct these inventories more frequently, such as annually.
  #Impression: This is accurate.


In [None]:
# In question 2 (b), if anyone used ChatGPT 3 to figure out what they needed to get their required medical license to train in a residency/fellowship in Illinois, they would not get the complete requirements.
# This would cause a delay in their training, which would affect the program by having their peers pick up extra cases, and ultimately affect patient care.

In [None]:
# Our Sentence:
#Large language models like ChatGPT are continually learning and improving as they process interactions and feedback from a growing number of users.
#This ongoing learning process enables these models to generate increasingly accurate and relevant outputs over time.
#However, the constant influx of new data also necessitates the development of more advanced computer systems and hardware capable of handling the increased storage and processing requirements.
#Effective management of this massive amount of data will be crucial to ensure that these language models continue to deliver improved performance without compromising on efficiency or speed.
#Therefore, ongoing technological advancements in hardware and infrastructure will be essential to accommodate the expanding capabilities of large language models in the future.
#While there have been significant advancements in computational technology, the rise of large language models also presents ethical challenges that will require the development of new laws and regulations.
#These measures will be necessary to ensure a balanced coexistence between humans and these advanced AI systems.
#Additionally, as the potential impact of large language models on daily life becomes more apparent, it is likely that governments will compete and potentially engage in intellectual property theft in an effort to establish dominance in this field.
#The interplay between technological development, ethical considerations, and global competition will undoubtedly be an intriguing aspect to watch unfold in the coming years.


# ChatGPT Sentence:
#As an AI language model, I can provide insights into the potential future of large language models (LLMs) based on current trends and developments in the field of artificial intelligence and natural language processing. Here are some thoughts on the future of LLMs:
#*Continued Advancements: Large language models have already made significant strides in natural language understanding and generation, and we can expect further advancements in their capabilities. Future LLMs may become even more sophisticated, with improved contextual understanding, better handling of ambiguity, and enhanced ability to generate human-like text.
#*Applications Across Industries: LLMs have diverse applications across various industries, including customer service, content generation, language translation, healthcare, and education. As LLM technology evolves, we can expect to see even more innovative applications and use cases emerge, transforming how we interact with technology and access information.
#*Ethical and Societal Implications: The growing capabilities of LLMs raise important ethical and societal considerations. Issues such as bias, misinformation, privacy, and the impact on employment are areas of concern that will need to be addressed as LLMs continue to advance. Responsible development and use of LLMs will be essential to mitigate potential risks and ensure positive outcomes for society.
#*Integration with Other Technologies: LLMs are likely to be integrated with other emerging technologies such as augmented reality (AR), virtual reality (VR), and the Internet of Things (IoT). This integration could enable more seamless and intuitive interactions with AI systems, leading to new opportunities for personalized experiences and enhanced productivity.
#*Research and Innovation: Ongoing research and innovation in the field of LLMs will drive further progress and breakthroughs. Collaboration between academia, industry, and other stakeholders will be crucial to address challenges and unlock the full potential of LLM technology.
#*Accessibility and Inclusivity: Efforts to democratize access to LLM technology will be important to ensure that benefits are accessible to everyone. This includes developing LLMs that support multiple languages and dialects, as well as addressing accessibility barriers for people with disabilities.
#Overall, the future of large language models holds great promise for transforming how we interact with and harness the power of language. However, it will be important to approach their development and deployment with careful consideration of ethical, societal, and technical considerations to maximize their positive impact.


Character N-Gram Model Building for spell-checking model

In [None]:
import nltk
from nltk.corpus import brown
from collections import defaultdict
import math

# Ensure the Brown corpus is downloaded
nltk.download('brown')

# Load the sentences from the Brown corpus and build a dictionary of unique words
# Every token is lower-cased, so membership tests against this set are only
# meaningful for lower-case candidates. The whole corpus is used here, not
# just a training split.
unique_words = set(word.lower() for sentence in brown.sents() for word in sentence)

# Character-level trigram model building function
def build_char_ngram_model(text, n=3):
    """Estimate a character-level n-gram model, P(next char | previous n-1 chars).

    Args:
        text: iterable of lines. A line may be a plain string, which is used
            as-is, or a sequence of word tokens, which are joined with single
            spaces. (Joining a string with ' ' would put a space between every
            character and train the model on the wrong contexts.)
        n: n-gram order, at least 1. Each line is padded with n-1 '<s>'
            markers in front and one '</s>' at the end.

    Returns:
        A defaultdict mapping a context tuple of n-1 symbols to a
        defaultdict of next_symbol -> conditional relative frequency.

    Raises:
        ValueError: if `n` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"build_char_ngram_model: n must be >= 1, got {n}")

    model = defaultdict(lambda: defaultdict(int))
    for line in text:
        line_text = line if isinstance(line, str) else ' '.join(line)
        processed_line = ['<s>'] * (n - 1) + list(line_text) + ['</s>']
        for i in range(len(processed_line) - n + 1):
            ngram = tuple(processed_line[i:i + n - 1])
            next_char = processed_line[i + n - 1]
            model[ngram][next_char] += 1
    # Normalise each context's follower counts into a distribution.
    for followers in model.values():
        total_count = float(sum(followers.values()))
        for char in followers:
            followers[char] /= total_count
    return model

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


Candidate generation with dictionary filtering

In [None]:
def edits1_dictionary(word, dictionary):
    """Return the dictionary entries that are one edit away from `word`.

    An edit is a single-character deletion, a swap of two adjacent
    characters, a replacement by a lower-case ASCII letter, or an insertion
    of a lower-case ASCII letter. Replacing a letter by itself counts, so
    `word` is returned too when it is in the dictionary.

    Args:
        word: the string to edit.
        dictionary: container supporting `in` for candidate strings.

    Returns:
        A set of all one-edit candidates found in `dictionary`.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    matches = set()

    def keep_if_known(candidate):
        if candidate in dictionary:
            matches.add(candidate)

    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        if tail:
            keep_if_known(head + tail[1:])                       # deletion
        if len(tail) > 1:
            keep_if_known(head + tail[1] + tail[0] + tail[2:])   # transposition
        for letter in alphabet:
            if tail:
                keep_if_known(head + letter + tail[1:])          # replacement
            keep_if_known(head + letter + tail)                  # insertion
    return matches

Scoring function based on the character-level trigram model

In [None]:
def score(word, model, n=3):
    """Score `word` as the product of its character n-gram probabilities.

    The word is padded with n-1 '<s>' markers in front and one '</s>' at the
    end; each symbol is then scored given the n-1 symbols before it.

    Args:
        word: the candidate string.
        model: mapping context tuple (n-1 symbols) -> (mapping symbol ->
            probability).
        n: n-gram order the model was built with (default 3, trigrams).

    Returns:
        The product of the per-symbol probabilities as a float. A context or
        symbol missing from the model contributes 1e-6.
    """
    unseen_probability = 1e-6
    probability = 1.0
    processed_word = ['<s>'] * (n - 1) + list(word) + ['</s>']
    for i in range(len(processed_word) - n + 1):
        ngram = tuple(processed_word[i:i + n - 1])
        char = processed_word[i + n - 1]
        # .get() keeps a defaultdict model from growing on unseen contexts
        # and also lets a plain dict be used as the model.
        followers = model.get(ngram)
        probability *= followers.get(char, unseen_probability) if followers else unseen_probability
    return probability

Correcting function using the model and dictionary

In [None]:
def correct_with_dictionary(word, model, dictionary):
    """Return the most plausible spelling of `word`.

    Args:
        word: the possibly misspelled word.
        model: character n-gram model passed through to `score`.
        dictionary: container of known words (supports `in`).

    Returns:
        `word` itself when it is already in `dictionary` or when no
        dictionary word is one edit away; otherwise the one-edit candidate
        with the highest `score`.
    """
    # A known word needs no correction; without this check it would be
    # replaced by whichever one-edit neighbour happens to score highest.
    if word in dictionary:
        return word
    candidates = edits1_dictionary(word, dictionary)
    if not candidates:  # If no candidates are found, return the original word
        return word
    return max(candidates, key=lambda candidate: score(candidate, model))

Example usage
Prepare the data

In [None]:
# Use the first 80% of the Brown sentences as training text.
split_index = int(len(brown.sents()) * 0.8)
# NOTE(review): this rebinds D_train. Earlier cells use D_train as a sequence
# of token lists, whereas here it becomes a list of space-joined strings, so
# re-running those earlier cells after this one would see a different kind of
# data. Consider a distinct name.
D_train = [' '.join(sentence) for sentence in brown.sents()[:split_index]]

# Build the character-level trigram model
char_trigram_model = build_char_ngram_model(D_train, n=3)

Attempt to correct a misspelled word

In [None]:
# Demo: correct a single transposition error ("el" <-> "le").
misspelled_word = "exampel"  # Using the word Example for the misspelled word
# Candidates come from the lower-cased Brown vocabulary; the character
# trigram model ranks them.
correction = correct_with_dictionary(misspelled_word, char_trigram_model, unique_words)
print(f"Corrected '{misspelled_word}' to '{correction}'")

Corrected 'exampel' to 'example'
