import torch
import torch.nn as nn
from transformers import PreTrainedModel, PretrainedConfig


class RobertaSentimentConfig(PretrainedConfig):
    """Configuration for a small RoBERTa-style sentiment classifier."""

    model_type = "roberta-sentiment"

    def __init__(self,
                 vocab_size=30000,
                 hidden_size=512,
                 num_attention_heads=8,
                 num_hidden_layers=6,
                 intermediate_size=2048,
                 max_position_embeddings=128,
                 num_labels=5,
                 hidden_dropout_prob=0.1,
                 **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        self.intermediate_size = intermediate_size
        self.max_position_embeddings = max_position_embeddings
        self.num_labels = num_labels
        self.hidden_dropout_prob = hidden_dropout_prob
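

# A minimal usage sketch (illustrative, not part of the model definition): because
# the config subclasses PretrainedConfig, it gets save_pretrained/from_pretrained
# for free, so the hyperparameters above can be round-tripped to disk. The
# directory name "./roberta-sentiment" is just an example path.
#
#   config = RobertaSentimentConfig(num_labels=3)
#   config.save_pretrained("./roberta-sentiment")
#   config = RobertaSentimentConfig.from_pretrained("./roberta-sentiment")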


class TransformerBlock(nn.Module):
    """Pre-norm transformer encoder block: self-attention plus feed-forward."""

    def __init__(self, hidden_dim, num_heads, ffn_dim, dropout):
        super().__init__()
        self.attn_norm = nn.LayerNorm(hidden_dim)
        self.ffn_norm = nn.LayerNorm(hidden_dim)
        self.attn = nn.MultiheadAttention(hidden_dim, num_heads, dropout=dropout, batch_first=True)
        self.ffn = nn.Sequential(
            nn.Linear(hidden_dim, ffn_dim),
            nn.GELU(),
            nn.Linear(ffn_dim, hidden_dim),
            nn.Dropout(dropout),
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, attention_mask):
        # Self-attention sub-layer with a residual connection (pre-norm).
        x_norm = self.attn_norm(x)
        # key_padding_mask expects True at padding positions; the HF-style
        # attention_mask uses 1 for real tokens and 0 for padding.
        key_padding_mask = attention_mask == 0
        attn_out, _ = self.attn(x_norm, x_norm, x_norm, key_padding_mask=key_padding_mask)
        x = x + self.dropout(attn_out)
        # Feed-forward sub-layer with a residual connection.
        x_norm = self.ffn_norm(x)
        x = x + self.dropout(self.ffn(x_norm))
        return x
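

# Quick shape sketch (assumed values, for illustration only): with hidden_dim=512
# and batch_first=True, the block maps a (batch, seq, 512) tensor to a tensor of
# the same shape, e.g.:
#
#   block = TransformerBlock(hidden_dim=512, num_heads=8, ffn_dim=2048, dropout=0.1)
#   x = torch.randn(2, 16, 512)
#   mask = torch.ones(2, 16, dtype=torch.long)   # 1 = real token, 0 = padding
#   assert block(x, mask).shape == (2, 16, 512)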


class RobertaForSentimentClassification(PreTrainedModel):
    """Small RoBERTa-style encoder with a classification head on the first token."""

    config_class = RobertaSentimentConfig

    def __init__(self, config):
        super().__init__(config)
        self.token_emb = nn.Embedding(config.vocab_size, config.hidden_size)
        self.position_emb = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.layers = nn.ModuleList([
            TransformerBlock(config.hidden_size, config.num_attention_heads,
                             config.intermediate_size, config.hidden_dropout_prob)
            for _ in range(config.num_hidden_layers)
        ])
        self.classifier = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.GELU(),
            nn.Dropout(config.hidden_dropout_prob),
            nn.Linear(config.hidden_size, config.num_labels),
        )
        self.init_weights()

    def forward(self, input_ids, attention_mask=None):
        batch_size, seq_len = input_ids.size()
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        # Learned absolute position embeddings, added to the token embeddings.
        positions = torch.arange(seq_len, device=input_ids.device).unsqueeze(0).expand(batch_size, seq_len)
        x = self.token_emb(input_ids) + self.position_emb(positions)
        x = self.dropout(x)
        for layer in self.layers:
            x = layer(x, attention_mask)
        # Classify from the hidden state of the first token (<s> / [CLS]).
        cls_token = x[:, 0]
        logits = self.classifier(cls_token)
        return {"logits": logits}