ML / AI Algorithms from Scratch

1 Attention

1.1 Scaled Dot-Product Attention

import torch
import torch.nn as nn
import torch.nn.functional as F

class ScaledDotProductAttention(nn.Module):
    def __init__(self, d_model, d_k, d_v):
        super().__init__()
        self.d_model = d_model
        self.d_k = d_k

        self.q_proj = nn.Linear(d_model, d_k)
        self.k_proj = nn.Linear(d_model, d_k)
        self.v_proj = nn.Linear(d_model, d_v)

    def forward(self, X, attn_mask=False):
        """
        Input:
            X: [batch_size, seq_len, d_model]
        Intermediate:
            Q: [batch_size, seq_len, d_k]
            K: [batch_size, seq_len, d_k]
            V: [batch_size, seq_len, d_v]
            attn_scores: [batch_size, seq_len, seq_len]
            attn_mask: [batch_size, seq_len, seq_len]
            attn_probs: [batch_size, seq_len, seq_len]
        Output:
            context: [batch_size, seq_len, d_v]
        """

        Q = self.q_proj(X)
        K = self.k_proj(X)
        V = self.v_proj(X)

        attn_scores = (Q @ K.transpose(-2, -1)) / torch.sqrt(torch.tensor(self.d_k, dtype=torch.float32))

        if attn_mask:
            attn_mask = torch.triu(torch.ones_like(attn_scores) * float('-inf'), diagonal=1)
        else:
            attn_mask = torch.zeros_like(attn_scores)
			
        attn_probs = F.softmax(attn_scores + attn_mask, dim=-1)

        context = attn_probs @ V

        return context


if __name__ == "__main__":
    # config.py
    batch_size = 2
    seq_len = 10
    d_model = 512
    d_k = 64
    d_v = 32

    X = torch.randn(batch_size, seq_len, d_model)

    SDPA = ScaledDotProductAttention(d_model, d_k, d_v)
    context = SDPA(X, attn_mask=True)

    print(f"context shape: {context.shape}") # [batch_size, seq_len, d_v]

In cross-attention (encoder-decoder attention), the queries come from the target side while the keys and values come from the source side, so src_seq_len_k = src_seq_len_v \(\neq\) target_seq_len_q.
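
A minimal sketch of that shape difference, assuming the same projection layout as the ScaledDotProductAttention above (the argument names X_q for target-side states and X_kv for source-side states are hypothetical):

import torch
import torch.nn as nn
import torch.nn.functional as F

class CrossAttention(nn.Module):
    def __init__(self, d_model, d_k, d_v):
        super().__init__()
        self.d_k = d_k
        self.q_proj = nn.Linear(d_model, d_k)
        self.k_proj = nn.Linear(d_model, d_k)
        self.v_proj = nn.Linear(d_model, d_v)

    def forward(self, X_q, X_kv):
        Q = self.q_proj(X_q)   # [batch_size, target_seq_len, d_k]
        K = self.k_proj(X_kv)  # [batch_size, src_seq_len, d_k]
        V = self.v_proj(X_kv)  # [batch_size, src_seq_len, d_v]

        attn_scores = Q @ K.transpose(-2, -1) / self.d_k ** 0.5  # [batch_size, target_seq_len, src_seq_len]
        attn_probs = F.softmax(attn_scores, dim=-1)
        return attn_probs @ V  # [batch_size, target_seq_len, d_v]


if __name__ == "__main__":
    X_q = torch.randn(2, 7, 512)    # target_seq_len = 7
    X_kv = torch.randn(2, 10, 512)  # src_seq_len = 10
    print(CrossAttention(512, 64, 32)(X_q, X_kv).shape)  # torch.Size([2, 7, 32])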


1.2 Multi-Head Attention

import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_head, d_k, d_v, dropout=0.1):
        super().__init__()
        assert d_model % n_head == 0, "d_model must be divisible by n_head"

        self.d_model = d_model
        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v

        self.q_proj = nn.Linear(d_model, n_head * d_k)
        self.k_proj = nn.Linear(d_model, n_head * d_k)
        self.v_proj = nn.Linear(d_model, n_head * d_v)
        self.dropout = nn.Dropout(dropout)

        self.dense = nn.Linear(self.n_head * d_v, self.d_model)

    def forward(self, X, mask=None):
        batch_size = X.shape[0]

        Q = self.q_proj(X).view(batch_size, -1, self.n_head, self.d_k).transpose(1, 2)
        K = self.k_proj(X).view(batch_size, -1, self.n_head, self.d_k).transpose(1, 2)
        V = self.v_proj(X).view(batch_size, -1, self.n_head, self.d_v).transpose(1, 2)

        attn_scores = Q @ K.transpose(-2, -1) / torch.sqrt(torch.tensor(self.d_k, dtype=torch.float32))

        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)

        attn_probs = F.softmax(attn_scores, dim=-1)
        attn_probs = self.dropout(attn_probs)

        context = attn_probs @ V  # [batch_size, n_head, seq_len, d_v]

        # Bring the head dimension next to d_v before flattening, then project back to d_model
        outputs = context.transpose(1, 2).contiguous().view(batch_size, -1, self.n_head * self.d_v)
        outputs = self.dense(outputs)

        return outputs


if __name__ == '__main__':
    # config.py
    batch_size = 2
    seq_len = 3
    d_model = 512
    n_head = 8
    d_k = 5
    d_v = 10
    mask = None
    dropout = 0.2

    X = torch.randn(batch_size, seq_len, d_model)
    MHA = MultiHeadAttention(d_model, n_head, d_k, d_v, dropout)
    output = MHA(X, mask)
    print(output.shape) # [batch_size, seq_len, d_model]
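
For reference, PyTorch 2.x ships a fused implementation of the same per-head computation, torch.nn.functional.scaled_dot_product_attention. A minimal sketch (the Q/K/V tensors here are random stand-ins for the projected-and-reshaped tensors produced above):

import torch
import torch.nn.functional as F

batch_size, n_head, seq_len, d_k = 2, 8, 3, 64
Q = torch.randn(batch_size, n_head, seq_len, d_k)
K = torch.randn(batch_size, n_head, seq_len, d_k)
V = torch.randn(batch_size, n_head, seq_len, d_k)

# Scaling by 1/sqrt(d_k), optional masking, softmax and dropout all happen inside the call
context = F.scaled_dot_product_attention(Q, K, V, attn_mask=None, dropout_p=0.0, is_causal=False)
print(context.shape)  # torch.Size([2, 8, 3, 64])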

1.3 Grouped-Query Attention

GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints

import torch
import torch.nn as nn
import torch.nn.functional as F

class GroupedQueryAttention(nn.Module):
    def __init__(self, d_model, n_head, n_group, d_k, d_v, dropout=0.1):
        super().__init__()
        assert n_head % n_group == 0, "Number of heads must be divisible by the number of groups"

        self.d_model = d_model
        self.n_head = n_head
        self.n_group = n_group
        self.d_k = d_k
        self.d_v = d_v
        self.heads_per_group = n_head // n_group

        # Linear projections: queries keep n_head heads, while keys and values are
        # shared within each group, so they only need n_group heads
        self.q_proj = nn.Linear(d_model, n_head * d_k)
        self.k_proj = nn.Linear(d_model, n_group * d_k)
        self.v_proj = nn.Linear(d_model, n_group * d_v)
        self.dropout = nn.Dropout(dropout)

        # Final output projection
        self.dense = nn.Linear(n_head * d_v, d_model)

    def forward(self, X, mask=None):
        batch_size = X.shape[0]
        seq_len = X.shape[1]

        # Project Q (n_head heads) and K, V (one shared head per group)
        Q = self.q_proj(X).view(batch_size, seq_len, self.n_head, self.d_k).transpose(1, 2)
        K = self.k_proj(X).view(batch_size, seq_len, self.n_group, self.d_k).transpose(1, 2)
        V = self.v_proj(X).view(batch_size, seq_len, self.n_group, self.d_v).transpose(1, 2)

        # Split the query heads into groups; each group shares a single K/V head
        Q = Q.reshape(batch_size, self.n_group, self.heads_per_group, seq_len, self.d_k)

        outputs = []
        for i in range(self.n_group):
            Q_group = Q[:, i]               # [batch_size, heads_per_group, seq_len, d_k]
            K_group = K[:, i].unsqueeze(1)  # [batch_size, 1, seq_len, d_k], broadcast over the group's heads
            V_group = V[:, i].unsqueeze(1)  # [batch_size, 1, seq_len, d_v]

            attn_scores = torch.matmul(Q_group, K_group.transpose(-2, -1)) / torch.sqrt(torch.tensor(self.d_k, dtype=torch.float32))

            if mask is not None:
                attn_scores = attn_scores.masked_fill(mask == 0, -1e9)

            attn_probs = F.softmax(attn_scores, dim=-1)
            attn_probs = self.dropout(attn_probs)

            context_group = torch.matmul(attn_probs, V_group)  # [batch_size, heads_per_group, seq_len, d_v]
            outputs.append(context_group)

        # Concatenate the group outputs along the head dimension
        context = torch.cat(outputs, dim=1)  # [batch_size, n_head, seq_len, d_v]

        # Bring heads next to d_v, flatten, then apply the final linear projection
        context = context.transpose(1, 2).contiguous().view(batch_size, seq_len, -1)
        outputs = self.dense(context)

        return outputs

if __name__ == '__main__':
    # config.py
    batch_size = 2
    seq_len = 3
    d_model = 512
    n_head = 8
    n_group = 2  # Number of groups
    d_k = 5
    d_v = 10
    mask = None
    dropout = 0.2

    X = torch.randn(batch_size, seq_len, d_model)
    GQA = GroupedQueryAttention(d_model, n_head, n_group, d_k, d_v, dropout)
    output = GQA(X, mask)
    print(output.shape)  # [batch_size, seq_len, d_model]

1.4 Multi-Query Attention

import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiQueryAttention(nn.Module):
    def __init__(self, d_model, n_head, d_k, d_v, dropout=0.1):
        super().__init__()

        self.d_model = d_model
        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v

        # Multi-head query projection
        self.q_proj = nn.Linear(d_model, n_head * d_k)

        # Single key and value head, shared by all query heads
        self.k_proj = nn.Linear(d_model, d_k)
        self.v_proj = nn.Linear(d_model, d_v)

        self.dropout = nn.Dropout(dropout)

        # Final output projection
        self.dense = nn.Linear(n_head * d_v, d_model)

    def forward(self, X, mask=None):
        # Ensure input dimensions match expectations
        assert X.dim() == 3, "Input tensor must have 3 dimensions [batch_size, seq_len, d_model]"
        assert X.size(2) == self.d_model, f"Input feature size must be {self.d_model}"

        batch_size, seq_len, _ = X.shape

        # Multi-head query projection: [batch_size, n_head, seq_len, d_k]
        Q = self.q_proj(X).view(batch_size, seq_len, self.n_head, self.d_k).transpose(1, 2)

        # Single shared key/value head, broadcast across the query heads:
        # K: [batch_size, 1, seq_len, d_k], V: [batch_size, 1, seq_len, d_v]
        K = self.k_proj(X).unsqueeze(1)
        V = self.v_proj(X).unsqueeze(1)

        # Scaled dot-product attention: [batch_size, n_head, seq_len, seq_len]
        attn_scores = Q @ K.transpose(-2, -1) / torch.sqrt(torch.tensor(self.d_k, dtype=torch.float32))

        # Apply mask (if any)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)

        # Attention probabilities
        attn_probs = F.softmax(attn_scores, dim=-1)
        attn_probs = self.dropout(attn_probs)

        # Context (output of attention): [batch_size, n_head, seq_len, d_v]
        context = attn_probs @ V

        # Merge heads and apply the final linear projection: [batch_size, seq_len, d_model]
        outputs = self.dense(context.transpose(1, 2).contiguous().view(batch_size, seq_len, -1))

        return outputs

if __name__ == '__main__':
    # Configuration
    batch_size = 2
    seq_len = 3
    d_model = 512
    n_head = 8
    d_k = 5
    d_v = 10
    mask = None
    dropout = 0.2

    X = torch.randn(batch_size, seq_len, d_model)
    MQA = MultiQueryAttention(d_model, n_head, d_k, d_v, dropout)
    output = MQA(X, mask)
    print(output.shape)  # [batch_size, seq_len, d_model]
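
One way to see the point of GQA and MQA is the size of the key/value tensors that must be cached per token at inference time. A back-of-the-envelope sketch using the demo configuration above (n_head = 8, n_group = 2, d_k = 5, d_v = 10); the numbers are illustrative only:

# Per-token, per-layer KV-cache entries (key + value activations)
n_head, n_group, d_k, d_v = 8, 2, 5, 10

mha_kv = n_head * (d_k + d_v)   # every query head has its own K/V head
gqa_kv = n_group * (d_k + d_v)  # query heads in a group share one K/V head
mqa_kv = 1 * (d_k + d_v)        # all query heads share a single K/V head

print(f"MHA: {mha_kv}, GQA: {gqa_kv}, MQA: {mqa_kv}")  # MHA: 120, GQA: 30, MQA: 15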


2 CNN

2.1 2D Convolutional Layer

CS231n Convolutional Neural Networks for Visual Recognition | Stanford

$H_{out} = \dfrac{H_{in} + 2 \times padding - H_{kernel}}{stride} + 1$

\(W_{out} = \dfrac{W_{in} + 2 \times padding - W_{kernel}}{stride} + 1\)
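
For example, with a \(5 \times 5\) input, a \(3 \times 3\) kernel, stride 1 and padding 1 (the configuration used in the demo below), \(H_{out} = W_{out} = \dfrac{5 + 2 \times 1 - 3}{1} + 1 = 5\), so the spatial size is preserved.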

Use brute force to compute each output element of each output channel, one position at a time:

import numpy as np

class Conv2d():
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding=0):
        
        self.in_channels = in_channels
        self.out_channels = out_channels
        if isinstance(kernel_size, int):
            self.kernel_height = self.kernel_width = kernel_size
        elif isinstance(kernel_size, tuple):
            self.kernel_height, self.kernel_width = kernel_size
        else:
            raise ValueError("kernel_size must be an int or a tuple")
        self.stride = stride
        self.padding = padding

        self.filter = np.random.normal(0, 1.0, (out_channels, in_channels, self.kernel_height, self.kernel_width))
        self.bias = np.zeros(out_channels)

    def forward(self, X):
        """
        Input:
            X: [batch_size, in_channels, in_height, in_width]
        Intermediate:
            window: [in_channels, kernel_height, kernel_width]
            filter: [out_channels, in_channels, kernel_height, kernel_width]
        Output:
            Z: [batch_size, out_channels, out_height, out_width]
        """

        batch_size, _, in_height, in_width = X.shape
        X = np.pad(X, ((0, 0), (0, 0), (self.padding, self.padding), (self.padding, self.padding)), mode='constant', constant_values=0)
        out_height = (in_height + 2*self.padding - self.kernel_height) // self.stride + 1
        out_width = (in_width + 2*self.padding - self.kernel_width) // self.stride + 1

        Z = np.zeros([batch_size, self.out_channels, out_height, out_width])

        for b in range(batch_size):
            for c in range(self.out_channels):
                for h in range(out_height):
                    for w in range(out_width):
                        # The window start in the padded input moves by `stride` per output element
                        h_start = h * self.stride
                        w_start = w * self.stride
                        window = X[b, :, h_start:h_start + self.kernel_height, w_start:w_start + self.kernel_width]
                        Z[b, c, h, w] = np.sum(window * self.filter[c, :, :, :]) + self.bias[c]

        return Z


if __name__ == "__main__":

    conv2d_layer = Conv2d(in_channels=3, out_channels=1, kernel_size=(3, 3), stride=1, padding=1)
    input_image = np.random.rand(1, 3, 5, 5)  # [batch_size, in_channels, input_height, input_width]
    output = conv2d_layer.forward(input_image) # [batch_size, out_channels, output_height, output_width]
    print(output.shape)
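
As a sanity check (not part of the original snippet, and assuming the Conv2d class above is in scope), the forward pass can be compared against torch.nn.Conv2d with the same weights and bias; the difference should be at floating-point precision:

import numpy as np
import torch
import torch.nn as nn

conv_np = Conv2d(in_channels=3, out_channels=2, kernel_size=3, stride=2, padding=1)
conv_pt = nn.Conv2d(3, 2, kernel_size=3, stride=2, padding=1)

# Copy the NumPy filter and bias into the PyTorch layer
with torch.no_grad():
    conv_pt.weight.copy_(torch.from_numpy(conv_np.filter).float())
    conv_pt.bias.copy_(torch.from_numpy(conv_np.bias).float())

x = np.random.rand(1, 3, 7, 7).astype(np.float32)
out_np = conv_np.forward(x)
out_pt = conv_pt(torch.from_numpy(x)).detach().numpy()

print(np.abs(out_np - out_pt).max())  # expected to be on the order of 1e-6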

3 Normalization


3.1 Layer Norm & RMS Norm

import torch
import torch.nn as nn

class LayerNorm(nn.Module):
    def __init__(self, embed_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(embed_dim))
        self.beta = nn.Parameter(torch.zeros(embed_dim))
        
    def forward(self, X):
        """
        Input:
            X: [batch_size, seq_len, d_model]
        Intermediate:
            mean: [batch_size, seq_len, 1]
            var: [batch_size, seq_len, 1]
        Output:
            X_normalized: [batch_size, seq_len, d_model]
        """
        mean = torch.mean(X, dim=-1, keepdim=True)
        var = torch.var(X, dim=-1, keepdim=True, unbiased=False)  # biased variance, as in standard LayerNorm
        X_normalized = (X - mean) / torch.sqrt(var + self.eps)

        return X_normalized * self.gamma + self.beta

class RMSNorm(nn.Module):
    def __init__(self, embed_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(embed_dim))

    def forward(self, X):
        """
        Input:
            X: [batch_size, seq_len, d_model]
        Intermediate:
            rms: [batch_size, seq_len, 1]
        Output:
            X_normalized: [batch_size, seq_len, d_model]
        """
        # eps goes inside the square root, following the usual RMSNorm formulation
        rms = torch.sqrt(torch.mean(X ** 2, dim=-1, keepdim=True) + self.eps)
        X_normalized = X / rms

        return X_normalized * self.gamma


if __name__ == "__main__":

    hidden_states_text = torch.randn(2, 100, 20) # [batch_size, seq_len, embed_dim]

    layer_norm = LayerNorm(embed_dim=20)
    rms_norm = RMSNorm(embed_dim=20)
    hidden_states_text_normalized = rms_norm(hidden_states_text)

    print(f"After RMS Norm: {hidden_states_text_normalized.shape}") # After RMS Norm: torch.Size([2, 100, 20])

3.2 Batch Norm

import torch
import torch.nn as nn

class BatchNorm1d(nn.Module):
    def __init__(self, num_features, eps=1e-5, momentum=0.1):
        super().__init__()
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(num_features))
        self.beta = nn.Parameter(torch.zeros(num_features))

        self.momentum = momentum
        self.running_mean = torch.zeros(num_features)
        self.running_var = torch.ones(num_features)

    def forward(self, X):
        """
        Input:
            X: [batch_size, num_features, length]  (here length = height * width)
        Intermediate:
            mean: [1, num_features, 1]
            var: [1, num_features, 1]
        Output:
            X_normalized: [batch_size, num_features, length]
        """
        if self.training:
            # Normalize each channel over the batch and spatial dimensions (biased variance)
            mean = torch.mean(X, dim=(0, 2), keepdim=True)
            var = torch.var(X, dim=(0, 2), keepdim=True, unbiased=False)
            # Exponential moving average of the batch statistics (PyTorch momentum convention)
            self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * mean.squeeze()
            self.running_var = (1 - self.momentum) * self.running_var + self.momentum * var.squeeze()
            X_normalized = (X - mean) / torch.sqrt(var + self.eps)
        else:
            mean = self.running_mean.view(1, -1, 1)
            var = self.running_var.view(1, -1, 1)
            X_normalized = (X - mean) / torch.sqrt(var + self.eps)

        return self.gamma.view(1, -1, 1) * X_normalized + self.beta.view(1, -1, 1)


if __name__ == "__main__":

    hidden_states_img = torch.randn(2, 3, 9, 9).reshape(2, 3, -1) # [batch_size, channels, height * width]

    batch_norm = BatchNorm1d(num_features=3)  # one scale/shift parameter per channel
    hidden_states_img_normalized = batch_norm(hidden_states_img)

    print(f"After Batch Norm: {hidden_states_img_normalized.shape}") # After Batch Norm: torch.Size([2, 3, 81])

4 Kmeans

import numpy as np

def kmeans(X, k, threshold = 1, max_iters=100):
    # Randomly choose k data points as initial centroids
    centroids = X[np.random.choice(len(X), k, replace=False)]

    for i in range(max_iters):
        # Assign clusters
        distances = np.sqrt(((X - centroids[:, np.newaxis])**2).sum(axis=2))
        nearest_centroids = np.argmin(distances, axis=0)

        new_centroids = np.array([X[nearest_centroids==j].mean(axis=0) for j in range(k)])

        # Check for convergence
        # Condition 1: the centroids did not change at all
        if np.all(centroids == new_centroids):
            break
        # Condition 2: the total centroid movement is smaller than the threshold
        center_change = np.linalg.norm(new_centroids - centroids)
        if center_change < threshold:
            break
        centroids = new_centroids

    return centroids, nearest_centroids


if __name__ == "__main__":
    
    X = np.array([[1, 2], [1, 4], [1, 0],
                [10, 2], [10, 4], [10, 0]])

    centroids, labels = kmeans(X, k=2)
    print("Centroids:\n", centroids)
    print("Labels:", labels)

5 TF-IDF

TF-IDF is composed of two components:

  1. Term Frequency (TF): This measures how frequently a term occurs in a document. Since documents differ in length, a term may appear many more times in a long document than in a short one. Thus, the term frequency is often divided by the document length (the total number of terms in the document) as a form of normalization:
$TF(t,d)$ = $\dfrac{\text{Number of times term} ~ t ~ \text{appears in document} ~ d}{\text{Total number of terms in document} ~ d}$
  2. Inverse Document Frequency (IDF): This measures how important a term is. While computing TF, all terms are considered equally important. However, certain terms, such as "is", "of", and "that", may appear many times but carry little importance. Thus we need to weigh down the frequent terms while scaling up the rare ones, by computing the following:
$IDF(t,D)$ = $\log\left(\dfrac{\text{Total number of documents} ~ D}{\text{Number of documents with term} ~ t ~ \text{in them}}\right)$

$\quad ~~ $ Here, \(D\) is the total number of documents in the corpus.

  3. Combining TF and IDF:
$TFIDF(t,d,D) = TF(t,d) \times IDF(t,D)$

$\quad ~~ $ This value is high when \(t\) is frequent in a document \(d\) but rare across all documents in \(D\). This signifies that \(t\) is very descriptive or characteristic of \(d\).

import math

documents = [
    "the sky is blue",
    "the sun is bright",
    "the sun in the sky is bright",
    "we can see the shining sun, the bright sun"
]

tokenized_documents = [doc.lower().split() for doc in documents]
vocabulary = set(word for doc in tokenized_documents for word in doc)

def compute_tf(tokenized_doc, vocab):
    # Count frequencies of terms in the document
    term_count = {term: tokenized_doc.count(term) for term in vocab}
    # Normalize by the length of the document
    tf = {term: count / len(tokenized_doc) for term, count in term_count.items()}
    return tf

def compute_idf(tokenized_docs, vocab):
    # Count the number of documents containing each term
    doc_containing_term = {term: sum(1 for doc in tokenized_docs if term in doc) for term in vocab}
    # Calculate IDF, using log_e
    idf = {term: math.log(len(tokenized_docs) / count) for term, count in doc_containing_term.items()}
    return idf

# Calculate TF-IDF
def compute_tfidf(tf, idf):
    tfidf = {term: tf_value * idf[term] for term, tf_value in tf.items()}
    return tfidf

# Compute TF, IDF, and TF-IDF for each document
tfs = [compute_tf(doc, vocabulary) for doc in tokenized_documents]
idf = compute_idf(tokenized_documents, vocabulary)

tfidfs = [compute_tfidf(tf, idf) for tf in tfs]

# Output TF-IDF for each document
for i, tfidf in enumerate(tfidfs):
    print(f"Document {i + 1} TF-IDF:")
    for term, value in tfidf.items():
        print(f"  {term}: {value:.4f}")

1. What is the difference between BERT and GPT?
BERT: bidirectional, autoencoding pre-trained model + fine-tuning

GPT: unidirectional, autoregressive pre-trained model + prompting

Differences in application

BERT

Kuaishou, recommendation algorithm role

First interview

  • Self-introduction
  • Understanding of recommender systems: background, outlook, architecture, methods
  • Project deep dive: feature construction, model selection, evaluation metrics
  • Paper deep dive: background, model, evaluation metrics
  • Standard questions (all taken from my projects and papers): SVM principles and derivation, logistic regression principles and derivation, XGBoost principles and derivation, how XGBoost handles missing values, ways to handle model overfitting
  • Coding: the k smallest numbers in an array (quickselect-style)

Second interview

  • Self-introduction
  • Coding: compute the square root of x to 3 decimal places (first solved it with a 0.001 step-size search, then was asked to solve it with gradient descent and Newton's method)
  • Competition project deep dive
  • Standard questions: principles of Bayesian hyperparameter optimization, characteristics and differences of several boosting models, difference between Euclidean distance and cosine distance

Third interview

360, recommendation algorithm role (internship)

How to traverse a tree; how to write depth-first search and breadth-first search; the principle of cross-entropy.
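
A quick answer sketch for the traversal part (the TreeNode class and example tree here are hypothetical, not from the original notes):

from collections import deque

class TreeNode:
    def __init__(self, val=0, left=None, right=None):
        self.val, self.left, self.right = val, left, right

def dfs_preorder(root):
    """Depth-first (pre-order) traversal via recursion."""
    if root is None:
        return []
    return [root.val] + dfs_preorder(root.left) + dfs_preorder(root.right)

def bfs_level_order(root):
    """Breadth-first (level-order) traversal via a queue."""
    if root is None:
        return []
    order, queue = [], deque([root])
    while queue:
        node = queue.popleft()
        order.append(node.val)
        if node.left:
            queue.append(node.left)
        if node.right:
            queue.append(node.right)
    return order


if __name__ == "__main__":
    root = TreeNode(1, TreeNode(2, TreeNode(4)), TreeNode(3))
    print(dfs_preorder(root))     # [1, 2, 4, 3]
    print(bfs_level_order(root))  # [1, 2, 3, 4]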

Q1: The difference between shallow copy and deep copy?
Python 直接赋值、浅拷贝和深度拷贝解析 | 菜鸟教程

import copy

original_list = [[1, 2, 3],[4, 5, 6]]
shallow_copy = copy.copy(original_list)   # new outer list, but the inner lists are shared references
deep_copy = copy.deepcopy(original_list)  # recursively copies the inner lists as well

original_list[0][0] = 99

print("Original list:", original_list) # [[99, 2, 3], [4, 5, 6]]
print("Shallow copy:", shallow_copy)   # [[99, 2, 3], [4, 5, 6]]  (shares the mutated inner list)
print("Deep copy:", deep_copy)         # [[1, 2, 3], [4, 5, 6]]   (unaffected)

6 Sort

6.1 Quick Sort

class Solution:
    def quick_sort(self, nums: list[int], left: int, right: int):
        """Quick sort"""
        if left >= right:                          # terminate recursion when the subarray has length 1
            return
        pivot = self.partition(nums, left, right)  # sentinel partition
        self.quick_sort(nums, left, pivot - 1)     # recurse into the left subarray
        self.quick_sort(nums, pivot + 1, right)    # recurse into the right subarray

    def partition(self, nums: list[int], left: int, right: int) -> int:
        """Sentinel partition"""
        i, j = left, right
        pivot = nums[left]                         # use nums[left] as the pivot
        while i < j:
            while i < j and nums[j] >= pivot:      # search from the right for the first element smaller than the pivot
                j -= 1
            while i < j and nums[i] <= pivot:      # search from the left for the first element larger than the pivot
                i += 1
            nums[i], nums[j] = nums[j], nums[i]    # swap the two elements
        nums[i], nums[left] = nums[left], nums[i]  # move the pivot to the boundary between the two subarrays
        return i


if __name__ == "__main__":
    # Quick sort
    nums = [2, 4, 1, 0, 3, 5]
    k = 3

    solution = Solution()
    solution.quick_sort(nums, 0, len(nums) - 1)
    print("After quick sort, nums =", nums)

LeetCode 215. Kth Largest Element in an Array

Solution 1:

class Solution:
    def findKthLargest(self, nums: list[int], k: int) -> int:
        return sorted(nums)[len(nums) - k]

  • Time Complexity \(O(N \log N)\): where \(N\) is the number of elements in the array.
  • Space Complexity: depends on the implementation of the built-in sorting algorithm.

Solution 2:

class Solution:
    def findKthLargest(self, nums, k):
        def quick_select(nums, k):
            pivot = random.choice(nums)     # randomly choose a pivot
            big, equal, small = [], [], []  # partition elements greater than, less than, and equal to the pivot into big, small, equal
            for num in nums:
                if num > pivot:
                    big.append(num)
                elif num < pivot:
                    small.append(num)
                else:
                    equal.append(num)
            if k <= len(big):               # the k-th largest element is in big: recurse into it
                return quick_select(big, k)
            if len(nums) - len(small) < k:  # the k-th largest element is in small: recurse into it
                return quick_select(small, k - len(nums) + len(small))
            return pivot                    # the k-th largest element is in equal: return the pivot directly
        return quick_select(nums, k)


if __name__ == "__main__":
    import random
    nums = [2, 4, 1, 0, 3, 5]
    k = 3

    solution = Solution()
    num = solution.findKthLargest(nums, k)
    print(f"The kth largest number is {num}")

  • Time Complexity \(O(N)\): where \(N\) is the number of elements in the array.
    • One partition pass over an array of length \(N\) costs \(O(N)\).
    • After each partition, the subarray recursed into has average length \(\frac{N}{2}\).
    • So the total partition work is, on average, \(N + \frac{N}{2} + \frac{N}{4} + \dots + \frac{N}{N} = \frac{N - \frac{1}{2}}{1 - \frac{1}{2}} = 2N - 1\) (geometric series sum), i.e. \(O(N)\).
  • Space Complexity \(O(N)\): in the worst case, \(N - 1\) elements end up in a single list (big, equal, or small).

6.2 Merge Sort

Note that merge sort follows the same recursion order as post-order traversal of a binary tree:

  • Post-order traversal: recurse into the left subtree, then the right subtree, then process the root node.
  • Merge sort: recurse into the left subarray, then the right subarray, then merge.

def merge_sort(nums: list[int], left: int, right: int):
    """
    divide stage:
    [9, 7, 5, 6, 4]
    [9, 7, 5],[6, 4]
    [9, 7],[5],[6, 4]
    [9],[7],[5],[6, 4]
    [9],[7],[5],[6],[4]
    merge stage:
    [7, 9],[5],[6],[4]
    [5, 7, 9],[6],[4]
    [5, 7, 9],[4, 6]
    [4, 5, 6, 7, 9]
    """
    # base case: when subarray length is 1, recursion terminates
    if left >= right:
        return

    # divide stage
    mid = (left + right) // 2         # calculate midpoint
    merge_sort(nums, left, mid)       # recursion over left subarray
    merge_sort(nums, mid + 1, right)  # recursion over right subarray

    # merge stage
    merge(nums, left, mid, right)

def merge(nums: list[int], left: int, mid: int, right: int):

    tmp = [0] * (right - left + 1)    # store the merged subarray for update
    i, j = left, mid + 1              # The left subarray interval is [left, mid]; The right subarray interval is [mid+1, right]
    k = 0                             # for tmp array

    while i <= mid and j <= right:    # while both subarrays still have elements, compare and copy the smaller one into tmp
        if nums[i] <= nums[j]:
            tmp[k] = nums[i]
            i += 1
        else:
            tmp[k] = nums[j]
            j += 1
        k += 1

    while i <= mid:                   # copy the remaining elements of the left (then the right) subarray into tmp
        tmp[k] = nums[i]
        i += 1
        k += 1
    while j <= right:
        tmp[k] = nums[j]
        j += 1
        k += 1

    for k in range(0, len(tmp)):      # copy the merged elements in tmp back into the corresponding range of nums
        nums[left + k] = tmp[k]


if  __name__=='__main__':
    record = [9, 7, 5, 6, 4]

    merge_sort(record, 0, len(record)-1)
    print(record)

剑指 Offer 51. Reverse Pairs (逆序对)

https://leetcode.cn/problems/shu-zu-zhong-de-ni-xu-dui-lcof/solutions/622496/jian-zhi-offer-51-shu-zu-zhong-de-ni-xu-pvn2h/


7 Decoding (Beam Search)

import torch
import torch.nn.functional as F

def beam_search(LM_prob, beam_size=3):
    batch, seq_len, vocab_size = LM_prob.shape

    log_LM_prob = LM_prob.log()
    # At position 0, keep the beam_size most likely tokens; log_beam_prob and indices have shape (batch, beam)
    log_beam_prob, indices = log_LM_prob[:, 0, :].topk(beam_size, sorted=True)
    indices = indices.unsqueeze(-1)

    for i in range(1, seq_len):  # run beam search position by position

        # log_beam_prob: (batch, beam, vocab_size), the score of every possible one-token extension of each beam
        log_beam_prob = log_beam_prob.unsqueeze(-1) + log_LM_prob[:, i, :].unsqueeze(1).repeat(1, beam_size, 1)
        # keep the beam_size highest-scoring extensions over all (beam, token) pairs
        log_beam_prob, index = log_beam_prob.view(batch, -1).topk(beam_size, sorted=True)
        # beam_id: which previous beam each new beam extends; index: the actual token id
        beam_id = index // vocab_size  # beam_id, index: (batch, beam)
        index = index % vocab_size

        new_indices = []
        for j, bid, idx in zip(range(batch), beam_id, index):  # for each sample, gather the parent beams and append the new token ids
            new_indices.append(torch.cat([indices[j][bid], idx.unsqueeze(-1)], -1))
        indices = torch.stack(new_indices, 0)

    return indices, log_beam_prob


if __name__ == '__main__':

    LM_prob = F.softmax(torch.randn([32, 20, 1000]), dim=-1)  # a dummy language model output, LM_prob: (batch, seq_len, vocab_size)
    indices, log_prob = beam_search(LM_prob, beam_size=3)  # returns each candidate and its log prob; indices has shape (batch, beam_size, seq_len)
    print(indices)

Reference

11.5 Quick Sort - Hello 算法
11.6 Merge Sort - Hello 算法
2.4 Space Complexity - Hello 算法
