In [1]:
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from collections import defaultdict, Counter
import random
import portalocker

In [2]:
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import IMDB
from collections import Counter

tokenizer = get_tokenizer('basic_english')

# Define the iterator for building the vocabulary
def yield_tokens(data_iter, special_tokens):
    # Ensure special tokens are included in the first iteration
    for token in special_tokens:
        yield [token]
    for _, text in data_iter:
        yield tokenizer(text)

# Special tokens
special_tokens = ["<unk>", "<pad>", "<start>", "<end>"]

# Load the dataset iterator
train_iter = IMDB(split='train')

# Build the vocabulary with special tokens
vocab = build_vocab_from_iterator(yield_tokens(train_iter, special_tokens), specials=special_tokens)

# Set default index for unknown tokens
vocab.set_default_index(vocab["<unk>"])


In [3]:
def build_ngram_model(data, n=3):
    model = defaultdict(Counter)
    for sentence in data:
        for i in range(len(sentence)-n+1):
            context = tuple(sentence[i:i+n-1])
            target = sentence[i+n-1]
            model[context][target] += 1
    return model


In [4]:
import portalocker
with open("test.lock", "w") as lock_file:
    portalocker.lock(lock_file, portalocker.LOCK_EX)
    input("Lock acquired. Press Enter to release lock...")

import sys
print(sys.executable)


Lock acquired. Press Enter to release lock...
/usr/bin/python3


In [5]:
import torch.nn as nn

class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)  # Ensure output layer matches vocab size

    def forward(self, x):
        embedded = self.embedding(x)
        output, (hidden, cell) = self.lstm(embedded)
        output = self.fc(output[:, -1, :])  # Assuming you want the last output for classification
        return output


In [6]:
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from collections import Counter

def load_data(data_type='train'):
    tokenizer = get_tokenizer('basic_english')
    counter = Counter()
    for label, line in IMDB(split=data_type):
        counter.update(tokenizer(line))
    # Convert the counter to a list of tokens
    tokenized_text = [tok for tok, cnt in counter.items() for _ in range(cnt)]
    return tokenized_text

# Now, using this function, you can load and tokenize your dataset:
tokenized_text = load_data('train')  # This should now correctly load and tokenize the text data


In [7]:
def build_ngram_model(tokenized_text, n=3):
    model = {}
    for i in range(len(tokenized_text)-n):
        gram = tuple(tokenized_text[i:i+n-1])
        next_word = tokenized_text[i+n-1]
        if gram not in model:
            model[gram] = {}
        if next_word not in model[gram]:
            model[gram][next_word] = 0
        model[gram][next_word] += 1
    # Convert counts to probabilities
    for gram in model.keys():
        total = float(sum(model[gram].values()))
        for word in model[gram]:
            model[gram][word] /= total
    return model


In [8]:
import random

def generate_text(model, start_text, num_words=20, n=3):
    result = start_text.split()
    for _ in range(num_words):
        state = tuple(result[-(n-1):]) # Last (n-1) words
        next_words = model.get(state, None)
        if not next_words:
            break
        next_word = random.choices(list(next_words.keys()), weights=next_words.values())[0]
        result.append(next_word)
    return ' '.join(result)


In [9]:
# Example usage (adapt as necessary)
tokenized_text = load_data('train')  # Load and tokenize your text data
ngram_model = build_ngram_model(tokenized_text, n=3)

# Generate 5 sample reviews
for _ in range(5):
    print(generate_text(ngram_model, "My favorite movie", num_words=20, n=3))


My favorite movie
My favorite movie
My favorite movie
My favorite movie
My favorite movie


In [10]:
from torch.utils.data import Dataset
import torch

class IMDBDataset(Dataset):
    def __init__(self, data_iter, vocab, tokenizer):
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.data = []
        for label, text in data_iter:
            # Ensure to handle unknown tokens, assuming vocab returns a special index for unknown tokens
            numericalized_text = [self.vocab.get('<start>', self.vocab.get('<unk>'))] + \
                                 [self.vocab.get(token, self.vocab.get('<unk>')) for token in self.tokenizer(text)] + \
                                 [self.vocab.get('<end>', self.vocab.get('<unk>'))]
            self.data.append((numericalized_text, label))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        numericalized_text, label = self.data[idx]
        input_sequence = torch.tensor(numericalized_text[:-1], dtype=torch.long)
        target_sequence = torch.tensor(numericalized_text[1:], dtype=torch.long)
        return input_sequence, target_sequence, label  # Including label for supervised training


In [11]:
embedding_dim = 100
hidden_dim = 256
vocab_size = len(vocab)  # Assuming vocab is already built as shown above

model = LSTMModel(vocab_size, embedding_dim, hidden_dim)


In [12]:
from collections import Counter
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import IMDB
from torchtext.vocab import Vocab

# Initialize the tokenizer
tokenizer = get_tokenizer('basic_english')

# Function to yield tokens from the IMDB dataset
def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

# Load the dataset iterator
train_iter = IMDB(split='train')

# Build the vocabulary from the iterator
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=['<unk>', '<pad>', '<start>', '<end>'])

# Set special tokens and their indexes
vocab.set_default_index(vocab['<unk>'])  # Set default index for unknown tokens

# Assuming you now have a 'vocab' object properly set up, you can proceed with using it in your model.


In [13]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()


In [14]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.optim as optim

# LSTMModel definition
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)  # Output layer maps to num_classes

    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.lstm(embedded)
        output = self.fc(output)  # No need to select the last timestep unless it's sequence classification
        return output

class TextDataset(Dataset):
    def __init__(self, input_sequences, targets):
        self.input_sequences = input_sequences
        self.targets = targets

    def __len__(self):
        return len(self.input_sequences)

    def __getitem__(self, idx):
        return self.input_sequences[idx], self.targets[idx]

def collate_fn(batch):
    input_sequences, targets = zip(*batch)
    input_sequences_padded = pad_sequence([torch.tensor(seq) for seq in input_sequences], batch_first=True, padding_value=0)
    targets_padded = pad_sequence([torch.tensor(tgt) for tgt in targets], batch_first=True, padding_value=0)
    return input_sequences_padded, targets_padded

vocab_size = 10000  # Example vocab size
num_classes = 20  # Example number of classes

input_sequences = torch.randint(0, vocab_size, (1000, 10))  # Adjust as necessary
targets = torch.randint(0, num_classes, (1000, 10))  # Adjust as necessary

dataset = TextDataset(input_sequences, targets)
train_loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

model = LSTMModel(vocab_size, embedding_dim=100, hidden_dim=256, num_classes=num_classes)
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

# Assuming you have a list called 'loss_values' that stores the average loss for each epoch
loss_values = []

# Training loop
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for input_sequences, targets in train_loader:
        input_sequences, targets = input_sequences.long(), targets.long()

        optimizer.zero_grad()
        predictions = model(input_sequences)

        loss = criterion(predictions.view(-1, num_classes), targets.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    average_loss = total_loss / len(train_loader)
    loss_values.append(average_loss)
    print(f'Epoch {epoch+1}, Loss: {average_loss}')

  input_sequences_padded = pad_sequence([torch.tensor(seq) for seq in input_sequences], batch_first=True, padding_value=0)
  targets_padded = pad_sequence([torch.tensor(tgt) for tgt in targets], batch_first=True, padding_value=0)


Epoch 1, Loss: 2.998318374156952
Epoch 2, Loss: 2.962448127567768
Epoch 3, Loss: 2.9193638637661934
Epoch 4, Loss: 2.8279527500271797
Epoch 5, Loss: 2.69191175699234


In [15]:
import os

# Adjusted file path for Google Colab environment
file_path = '/content/bul.txt'
if os.path.exists(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = f.read()
    print("Dataset loaded successfully.")
else:
    print("File not found. Please check the file path.")


Dataset loaded successfully.


In [16]:
from gensim.models import KeyedVectors

In [17]:
glove_path = '/content/glove.6B.100d.txt'
import numpy as np

def load_glove_embeddings(path):
    embeddings = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

glove_embeddings = load_glove_embeddings(glove_path)
import torch
import torch.nn as nn

# Example vocab list - replace with your actual vocab
vocab = ['hello', 'world', '<unk>', '<pad>']  # Example vocabulary

vocab_size = len(vocab)
embedding_dim = 100  # Dimensionality of GloVe vectors you loaded
weights_matrix = np.zeros((vocab_size, embedding_dim))

# Create a weights matrix to initialize the embedding layer
for i, word in enumerate(vocab):
    try:
        weights_matrix[i] = glove_embeddings[word]
    except KeyError:
        # Initialize with a random vector if the word is not in GloVe
        weights_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dim, ))

# Create an embedding layer and load the GloVe weights
embedding_layer = nn.Embedding(vocab_size, embedding_dim)
embedding_layer.weight.data.copy_(torch.from_numpy(weights_matrix))


tensor([[ 2.6688e-01,  3.9632e-01,  6.1690e-01, -7.7451e-01, -1.0390e-01,
          2.6697e-01,  2.7880e-01,  3.0992e-01,  5.4685e-03, -8.5256e-02,
          7.3602e-01, -9.8432e-02,  5.4790e-01, -3.0305e-02,  3.3479e-01,
          1.4094e-01, -7.0003e-03,  3.2569e-01,  2.2902e-01,  4.6557e-01,
         -1.9531e-01,  3.7491e-01, -7.1390e-01, -5.1775e-01,  7.7039e-01,
          1.0881e+00, -6.6011e-01, -1.6234e-01,  9.1190e-01,  2.1046e-01,
          4.7494e-02,  1.0019e+00,  1.1133e+00,  7.0094e-01, -8.6960e-02,
          4.7571e-01,  1.6360e-01, -4.4469e-01,  4.4690e-01, -9.3817e-01,
          1.3101e-02,  8.5964e-02, -6.7456e-01,  4.9662e-01, -3.7827e-02,
         -1.1038e-01, -2.8612e-01,  7.4606e-02, -3.1527e-01, -9.3774e-02,
         -5.7069e-01,  6.6865e-01,  4.5307e-01, -3.4154e-01, -7.1660e-01,
         -7.5273e-01,  7.5212e-02,  5.7903e-01, -1.1910e-01, -1.1379e-01,
         -1.0026e-01,  7.1341e-01, -1.1574e+00, -7.4026e-01,  4.0452e-01,
          1.8023e-01,  2.1449e-01,  3.