ML / AI Algorithms from Scratch
1 Attention
1.1 Scaled Dot-Product Attention

import torch
import torch.nn as nn
import torch.nn.functional as F

class ScaledDotProductAttention(nn.Module):
    def __init__(self, d_model, d_k, d_v):
        super().__init__()
        self.d_model = d_model
        self.d_k = d_k
        self.q_proj = nn.Linear(d_model, d_k)
        self.k_proj = nn.Linear(d_model, d_k)
        self.v_proj = nn.Linear(d_model, d_v)

    def forward(self, X, attn_mask=False):
        """
        Input:
            X: [batch_size, seq_len, d_model]
        Intermediate:
            Q: [batch_size, seq_len, d_k]
            K: [batch_size, seq_len, d_k]
            V: [batch_size, seq_len, d_v]
            attn_scores: [batch_size, seq_len, seq_len]
            attn_mask: [batch_size, seq_len, seq_len]
            attn_probs: [batch_size, seq_len, seq_len]
        Output:
            context: [batch_size, seq_len, d_v]
        """
        Q = self.q_proj(X)
        K = self.k_proj(X)
        V = self.v_proj(X)
        attn_scores = (Q @ K.transpose(-2, -1)) / torch.sqrt(torch.tensor(self.d_k, dtype=torch.float32))
        if attn_mask:
            # causal mask: -inf strictly above the diagonal blocks attention to future positions
            attn_mask = torch.triu(torch.ones_like(attn_scores) * float('-inf'), diagonal=1)
        else:
            attn_mask = torch.zeros_like(attn_scores)
        attn_probs = F.softmax(attn_scores + attn_mask, dim=-1)
        context = attn_probs @ V
        return context

if __name__ == "__main__":
    # configuration
    batch_size = 2
    seq_len = 10
    d_model = 512
    d_k = 64
    d_v = 32
    X = torch.randn(batch_size, seq_len, d_model)
    SDPA = ScaledDotProductAttention(d_model, d_k, d_v)
    context = SDPA(X, attn_mask=True)
    print(f"context shape: {context.shape}")  # [batch_size, seq_len, d_v]
In cross-attention (encoder-decoder attention), K and V come from the source sequence, so src_seq_len_k = src_seq_len_v, while Q comes from the target sequence with length target_seq_len_q; the attention map then has shape [batch_size, target_seq_len, src_seq_len]. A sketch follows below.
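A minimal sketch of that shape difference, using the same projection layout as above (the `cross_attention` helper and the example sequence lengths are illustrative, not from the original notes):

import torch
import torch.nn as nn
import torch.nn.functional as F

def cross_attention(q_proj, k_proj, v_proj, target, source, d_k):
    # Q comes from the decoder (target); K and V come from the encoder (source)
    Q = q_proj(target)                              # [batch_size, tgt_len, d_k]
    K = k_proj(source)                              # [batch_size, src_len, d_k]
    V = v_proj(source)                              # [batch_size, src_len, d_v]
    scores = Q @ K.transpose(-2, -1) / d_k ** 0.5   # [batch_size, tgt_len, src_len]
    probs = F.softmax(scores, dim=-1)
    return probs @ V                                # [batch_size, tgt_len, d_v]

if __name__ == "__main__":
    d_model, d_k, d_v = 512, 64, 32
    q_proj = nn.Linear(d_model, d_k)
    k_proj = nn.Linear(d_model, d_k)
    v_proj = nn.Linear(d_model, d_v)
    target = torch.randn(2, 7, d_model)   # tgt_len = 7
    source = torch.randn(2, 10, d_model)  # src_len = 10
    print(cross_attention(q_proj, k_proj, v_proj, target, source, d_k).shape)  # torch.Size([2, 7, 32])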


1.2 Multi-Head Attention

import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_head, d_k, d_v, dropout=0.1):
        super().__init__()
        # d_k and d_v are per-head sizes here, so no divisibility constraint
        # between d_model and d_k/d_v is needed
        self.d_model = d_model
        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v
        self.q_proj = nn.Linear(d_model, n_head * d_k)
        self.k_proj = nn.Linear(d_model, n_head * d_k)
        self.v_proj = nn.Linear(d_model, n_head * d_v)
        self.dropout = nn.Dropout(dropout)
        self.dense = nn.Linear(self.n_head * d_v, self.d_model)

    def forward(self, X, mask=None):
        batch_size = X.shape[0]
        # [batch_size, n_head, seq_len, d_k/d_v]
        Q = self.q_proj(X).view(batch_size, -1, self.n_head, self.d_k).transpose(1, 2)
        K = self.k_proj(X).view(batch_size, -1, self.n_head, self.d_k).transpose(1, 2)
        V = self.v_proj(X).view(batch_size, -1, self.n_head, self.d_v).transpose(1, 2)
        attn_scores = Q @ K.transpose(-2, -1) / torch.sqrt(torch.tensor(self.d_k, dtype=torch.float32))
        if mask is not None:
            # mask is expected to broadcast against attn_scores, e.g. [batch_size, 1, seq_len, seq_len]
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        attn_probs = F.softmax(attn_scores, dim=-1)
        attn_probs = self.dropout(attn_probs)
        context = attn_probs @ V  # [batch_size, n_head, seq_len, d_v]
        # merge heads: bring seq_len forward before flattening,
        # otherwise heads and positions get mixed together
        outputs = context.transpose(1, 2).contiguous().view(batch_size, -1, self.n_head * self.d_v)
        outputs = self.dense(outputs)
        return outputs

if __name__ == '__main__':
    # configuration
    batch_size = 2
    seq_len = 3
    d_model = 512
    n_head = 8
    d_k = 5
    d_v = 10
    mask = None
    dropout = 0.2
    X = torch.randn(batch_size, seq_len, d_model)
    MHA = MultiHeadAttention(d_model, n_head, d_k, d_v, dropout)
    output = MHA(X, mask)
    print(output.shape)  # [batch_size, seq_len, d_model]
1.3 Grouped-Query Attention
GQA: Training Generalized Multi-Query Transformer Models from
Multi-Head Checkpoints

import torch
import torch.nn as nn
import torch.nn.functional as F

class GroupedQueryAttention(nn.Module):
    def __init__(self, d_model, n_head, n_group, d_k, d_v, dropout=0.1):
        super().__init__()
        assert n_head % n_group == 0, "Number of heads must be divisible by the number of groups"
        self.d_model = d_model
        self.n_head = n_head
        self.n_group = n_group
        self.d_k = d_k
        self.d_v = d_v
        self.heads_per_group = n_head // n_group
        # Queries keep n_head heads; keys and values are shared within each group,
        # so they only need n_group heads (this is what shrinks the KV cache)
        self.q_proj = nn.Linear(d_model, n_head * d_k)
        self.k_proj = nn.Linear(d_model, n_group * d_k)
        self.v_proj = nn.Linear(d_model, n_group * d_v)
        self.dropout = nn.Dropout(dropout)
        # Final output projection
        self.dense = nn.Linear(n_head * d_v, d_model)

    def forward(self, X, mask=None):
        batch_size, seq_len, _ = X.shape
        # Project Q with n_head heads, K and V with n_group heads
        Q = self.q_proj(X).view(batch_size, seq_len, self.n_head, self.d_k).transpose(1, 2)   # [b, n_head, s, d_k]
        K = self.k_proj(X).view(batch_size, seq_len, self.n_group, self.d_k).transpose(1, 2)  # [b, n_group, s, d_k]
        V = self.v_proj(X).view(batch_size, seq_len, self.n_group, self.d_v).transpose(1, 2)  # [b, n_group, s, d_v]
        # Group the query heads: [b, n_group, heads_per_group, s, d_k]
        Q = Q.reshape(batch_size, self.n_group, self.heads_per_group, seq_len, self.d_k)
        outputs = []
        for i in range(self.n_group):
            Q_group = Q[:, i]      # [b, heads_per_group, s, d_k]
            K_group = K[:, i:i+1]  # [b, 1, s, d_k], one K head shared by the whole group
            V_group = V[:, i:i+1]  # [b, 1, s, d_v], one V head shared by the whole group
            attn_scores = torch.matmul(Q_group, K_group.transpose(-2, -1)) / torch.sqrt(torch.tensor(self.d_k, dtype=torch.float32))
            if mask is not None:
                # mask is expected to broadcast against attn_scores, e.g. [b, 1, s, s]
                attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
            attn_probs = F.softmax(attn_scores, dim=-1)
            attn_probs = self.dropout(attn_probs)
            context_group = torch.matmul(attn_probs, V_group)  # [b, heads_per_group, s, d_v]
            outputs.append(context_group)
        # Concatenate the group outputs along the head dimension: [b, n_head, s, d_v]
        context = torch.cat(outputs, dim=1)
        # Merge heads, then apply the final linear projection
        context = context.transpose(1, 2).contiguous().view(batch_size, seq_len, self.n_head * self.d_v)
        outputs = self.dense(context)
        return outputs

if __name__ == '__main__':
    # configuration
    batch_size = 2
    seq_len = 3
    d_model = 512
    n_head = 8
    n_group = 2  # number of KV groups
    d_k = 5
    d_v = 10
    mask = None
    dropout = 0.2
    X = torch.randn(batch_size, seq_len, d_model)
    GQA = GroupedQueryAttention(d_model, n_head, n_group, d_k, d_v, dropout)
    output = GQA(X, mask)
    print(output.shape)  # [batch_size, seq_len, d_model]
1.4 Multi-Query Attention

import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiQueryAttention(nn.Module):
    def __init__(self, d_model, n_head, d_k, d_v, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v
        # Multi-head query projection
        self.q_proj = nn.Linear(d_model, n_head * d_k)
        # A single shared key head and value head (the "multi-query" trick)
        self.k_proj = nn.Linear(d_model, d_k)
        self.v_proj = nn.Linear(d_model, d_v)
        self.dropout = nn.Dropout(dropout)
        # Final output projection
        self.dense = nn.Linear(n_head * d_v, d_model)

    def forward(self, X, mask=None):
        # Ensure input dimensions match expectations
        assert X.dim() == 3, "Input tensor must have 3 dimensions [batch_size, seq_len, d_model]"
        assert X.size(2) == self.d_model, f"Input feature size must be {self.d_model}"
        batch_size, seq_len, _ = X.shape
        # Multi-head queries: [batch_size, n_head, seq_len, d_k]
        Q = self.q_proj(X).view(batch_size, seq_len, self.n_head, self.d_k).transpose(1, 2)
        # One key head and one value head, broadcast across all query heads
        K = self.k_proj(X).unsqueeze(1)  # [batch_size, 1, seq_len, d_k]
        V = self.v_proj(X).unsqueeze(1)  # [batch_size, 1, seq_len, d_v]
        # Scaled dot-product attention: [batch_size, n_head, seq_len, seq_len]
        attn_scores = Q @ K.transpose(-2, -1) / torch.sqrt(torch.tensor(self.d_k, dtype=torch.float32))
        # Apply mask (if any); it must broadcast against attn_scores, e.g. [batch_size, 1, seq_len, seq_len]
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        # Attention probabilities
        attn_probs = F.softmax(attn_scores, dim=-1)
        attn_probs = self.dropout(attn_probs)
        # Context (output of attention): [batch_size, n_head, seq_len, d_v]
        context = attn_probs @ V
        # Merge heads and apply the final linear projection: [batch_size, seq_len, d_model]
        context = context.transpose(1, 2).contiguous().view(batch_size, seq_len, self.n_head * self.d_v)
        return self.dense(context)

if __name__ == '__main__':
    # configuration
    batch_size = 2
    seq_len = 3
    d_model = 512
    n_head = 8
    d_k = 5
    d_v = 10
    mask = None
    dropout = 0.2
    X = torch.randn(batch_size, seq_len, d_model)
    MQA = MultiQueryAttention(d_model, n_head, d_k, d_v, dropout)
    output = MQA(X, mask)
    print(output.shape)  # [batch_size, seq_len, d_model]
2 CNN
2.1 2D Convolutional Layer
CS231n Convolutional Neural Networks for Visual Recognition | Stanford

Brute force: compute one element of one output channel at a time.
import numpy as np

class Conv2d():
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding=0):
        self.in_channels = in_channels
        self.out_channels = out_channels
        if isinstance(kernel_size, int):
            self.kernel_height = self.kernel_width = kernel_size
        elif isinstance(kernel_size, tuple):
            self.kernel_height, self.kernel_width = kernel_size
        else:
            raise ValueError("kernel_size must be an int or a tuple")
        self.stride = stride
        self.padding = padding
        self.filter = np.random.normal(0, 1.0, (out_channels, in_channels, self.kernel_height, self.kernel_width))
        self.bias = np.zeros(out_channels)

    def forward(self, X):
        """
        Input:
            X: [batch_size, in_channels, in_height, in_width]
        Intermediate:
            window: [in_channels, kernel_height, kernel_width]
            filter: [out_channels, in_channels, kernel_height, kernel_width]
        Output:
            Z: [batch_size, out_channels, out_height, out_width]
        """
        batch_size, _, in_height, in_width = X.shape
        X = np.pad(X, ((0, 0), (0, 0), (self.padding, self.padding), (self.padding, self.padding)), mode='constant', constant_values=0)
        out_height = (in_height + 2 * self.padding - self.kernel_height) // self.stride + 1
        out_width = (in_width + 2 * self.padding - self.kernel_width) // self.stride + 1
        Z = np.zeros([batch_size, self.out_channels, out_height, out_width])
        for b in range(batch_size):
            for c in range(self.out_channels):
                for h in range(out_height):
                    for w in range(out_width):
                        # the top-left corner of the receptive field moves in steps of `stride`
                        h_start, w_start = h * self.stride, w * self.stride
                        window = X[b, :, h_start:h_start + self.kernel_height, w_start:w_start + self.kernel_width]
                        Z[b, c, h, w] = np.sum(window * self.filter[c]) + self.bias[c]
        return Z

if __name__ == "__main__":
    conv2d_layer = Conv2d(in_channels=3, out_channels=1, kernel_size=(3, 3), stride=1, padding=1)
    input_image = np.random.rand(1, 3, 5, 5)  # [batch_size, in_channels, input_height, input_width]
    output = conv2d_layer.forward(input_image)  # [batch_size, out_channels, output_height, output_width]
    print(output.shape)
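As a quick sanity check (assuming PyTorch is available in the environment), the loop implementation can be compared against torch.nn.functional.conv2d, which computes the same cross-correlation; the stride-2 configuration here is my choice to exercise the striding path:

import numpy as np
import torch
import torch.nn.functional as F

conv = Conv2d(in_channels=3, out_channels=2, kernel_size=3, stride=2, padding=1)
x = np.random.rand(1, 3, 7, 7)
ours = conv.forward(x)
# reuse the same random filter and bias for the reference implementation
ref = F.conv2d(torch.tensor(x), torch.tensor(conv.filter), torch.tensor(conv.bias),
               stride=2, padding=1).numpy()
print(np.allclose(ours, ref))  # True, up to floating-point tolerance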
3 Normalization
3.1 Layer Norm & RMS Norm

import torch
import torch.nn as nn

class LayerNorm(nn.Module):
    def __init__(self, embed_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(embed_dim))
        self.beta = nn.Parameter(torch.zeros(embed_dim))

    def forward(self, X):
        """
        Input:
            X: [batch_size, seq_len, d_model]
        Intermediate:
            mean: [batch_size, seq_len, 1]
            var: [batch_size, seq_len, 1]
        Output:
            X_normalized: [batch_size, seq_len, d_model]
        """
        mean = torch.mean(X, dim=-1, keepdim=True)
        # biased variance (unbiased=False), matching the layer norm definition
        var = torch.var(X, dim=-1, keepdim=True, unbiased=False)
        X_normalized = (X - mean) / torch.sqrt(var + self.eps)
        return X_normalized * self.gamma + self.beta

class RMSNorm(nn.Module):
    def __init__(self, embed_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(embed_dim))

    def forward(self, X):
        """
        Input:
            X: [batch_size, seq_len, d_model]
        Intermediate:
            rms: [batch_size, seq_len, 1]
        Output:
            X_normalized: [batch_size, seq_len, d_model]
        """
        # eps goes inside the square root, as in the standard RMSNorm formulation
        rms = torch.sqrt(torch.mean(X ** 2, dim=-1, keepdim=True) + self.eps)
        X_normalized = X / rms
        return X_normalized * self.gamma

if __name__ == "__main__":
    hidden_states_text = torch.randn(2, 100, 20)  # [batch_size, seq_len, embed_dim]
    layer_norm = LayerNorm(embed_dim=20)
    rms_norm = RMSNorm(embed_dim=20)
    hidden_states_text_normalized = rms_norm(hidden_states_text)
    print(f"After RMS Norm: {hidden_states_text_normalized.shape}")  # After RMS Norm: torch.Size([2, 100, 20])
3.2 Batch Norm

Formula: y = gamma * (x - mu_B) / sqrt(sigma_B^2 + eps) + beta, where mu_B and sigma_B^2 are the per-channel batch mean and (biased) batch variance; running estimates of both are maintained during training for use at inference time.
class BatchNorm1d(nn.Module):
    def __init__(self, num_features, eps=1e-5, momentum=0.1):
        super().__init__()
        self.eps = eps
        self.momentum = momentum
        self.gamma = nn.Parameter(torch.ones(num_features))
        self.beta = nn.Parameter(torch.zeros(num_features))
        # running statistics are buffers, not learnable parameters
        self.register_buffer('running_mean', torch.zeros(num_features))
        self.register_buffer('running_var', torch.ones(num_features))

    def forward(self, X):
        """
        Input:
            X: [batch_size, num_features, length]
        Intermediate:
            mean: [1, num_features, 1]
            var: [1, num_features, 1]
        Output:
            X_normalized: [batch_size, num_features, length]
        """
        if self.training:
            # one statistic per channel, computed over the batch and spatial dims
            mean = torch.mean(X, dim=(0, 2), keepdim=True)
            var = torch.var(X, dim=(0, 2), keepdim=True, unbiased=False)
            # PyTorch convention: running = (1 - momentum) * running + momentum * batch_stat
            self.running_mean = ((1 - self.momentum) * self.running_mean + self.momentum * mean.squeeze()).detach()
            self.running_var = ((1 - self.momentum) * self.running_var + self.momentum * var.squeeze()).detach()
        else:
            mean = self.running_mean.view(1, -1, 1)
            var = self.running_var.view(1, -1, 1)
        X_normalized = (X - mean) / torch.sqrt(var + self.eps)
        # gamma and beta are per-channel, so reshape them for broadcasting
        return self.gamma.view(1, -1, 1) * X_normalized + self.beta.view(1, -1, 1)

if __name__ == "__main__":
    hidden_states_img = torch.randn(2, 3, 9, 9).reshape(2, 3, -1)  # [batch_size, channels, height * width]
    batch_norm = BatchNorm1d(num_features=3)  # one feature per channel
    hidden_states_img_normalized = batch_norm(hidden_states_img)
    print(f"After Batch Norm: {hidden_states_img_normalized.shape}")  # After Batch Norm: torch.Size([2, 3, 81])
4 Kmeans
import numpy as np

def kmeans(X, k, threshold=1, max_iters=100):
    # Randomly choose k data points as the initial centroids
    centroids = X[np.random.choice(len(X), k, replace=False)]
    for _ in range(max_iters):
        # Assign each point to its nearest centroid
        distances = np.sqrt(((X - centroids[:, np.newaxis]) ** 2).sum(axis=2))  # [k, n]
        nearest_centroids = np.argmin(distances, axis=0)
        new_centroids = np.array([X[nearest_centroids == j].mean(axis=0) for j in range(k)])
        # Check for convergence
        # Condition 1: the centroids no longer change
        if np.all(centroids == new_centroids):
            break
        # Condition 2: the centroid movement is smaller than the threshold
        center_change = np.linalg.norm(new_centroids - centroids)
        if center_change < threshold:
            break
        centroids = new_centroids
    return centroids, nearest_centroids

if __name__ == "__main__":
    X = np.array([[1, 2], [1, 4], [1, 0],
                  [10, 2], [10, 4], [10, 0]])
    centroids, labels = kmeans(X, k=2)
    print("Centroids:\n", centroids)
    print("Labels:", labels)
5 TF-IDF
TF-IDF is composed of two components:
- Term Frequency (TF): This measures how frequently a term occurs in a document. Since documents differ in length, a term is likely to appear more often in a long document than in a short one, so the term frequency is usually divided by the document length (the total number of terms in the document) as a form of normalization:
  tf(t, d) = (number of times t appears in d) / (total number of terms in d)
- Inverse Document Frequency (IDF): This measures how important a term is. When computing TF, all terms are treated as equally important. However, certain terms, such as "is", "of", and "that", may appear many times while carrying little meaning. We therefore weigh down the frequent terms and scale up the rare ones by computing:
  idf(t, D) = log(N / |{d in D : t appears in d}|)
  Here, N is the total number of documents in the corpus D.
- Combining TF and IDF:
  tfidf(t, d, D) = tf(t, d) * idf(t, D)
This value is high when t is frequent in a document but rare across all documents in D. This signifies that t is very descriptive or characteristic of d.
import math

documents = [
    "the sky is blue",
    "the sun is bright",
    "the sun in the sky is bright",
    "we can see the shining sun, the bright sun"
]
tokenized_documents = [doc.lower().split() for doc in documents]
vocabulary = set(word for doc in tokenized_documents for word in doc)

def compute_tf(tokenized_doc, vocab):
    # Count frequencies of terms in the document
    term_count = {term: tokenized_doc.count(term) for term in vocab}
    # Normalize by the length of the document
    tf = {term: count / len(tokenized_doc) for term, count in term_count.items()}
    return tf

def compute_idf(tokenized_docs, vocab):
    # Count the number of documents containing each term
    doc_containing_term = {term: sum(1 for doc in tokenized_docs if term in doc) for term in vocab}
    # Calculate IDF, using log_e
    idf = {term: math.log(len(tokenized_docs) / count) for term, count in doc_containing_term.items()}
    return idf

# Calculate TF-IDF
def compute_tfidf(tf, idf):
    tfidf = {term: tf_value * idf[term] for term, tf_value in tf.items()}
    return tfidf

# Compute TF, IDF, and TF-IDF for each document
tfs = [compute_tf(doc, vocabulary) for doc in tokenized_documents]
idf = compute_idf(tokenized_documents, vocabulary)
tfidfs = [compute_tfidf(tf, idf) for tf in tfs]

# Output TF-IDF for each document
for i, tfidf in enumerate(tfidfs):
    print(f"Document {i + 1} TF-IDF:")
    for term, value in tfidf.items():
        print(f" {term}: {value:.4f}")
1. What is the difference between BERT and GPT?
BERT: bidirectional, autoencoding; pre-trained model + fine-tuning
GPT: unidirectional (causal), autoregressive; pre-trained model + prompting (instructions/hints). At the architecture level the bidirectional vs. unidirectional difference comes down to the attention mask; see the sketch below.
Differences in application:
BERT
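A minimal sketch of that mask difference (the shapes and the 0/1 convention are illustrative, matching the masked_fill(mask == 0, ...) usage elsewhere in these notes):

import torch

seq_len = 5
# BERT-style (bidirectional): every token can attend to every token
bert_mask = torch.ones(seq_len, seq_len)
# GPT-style (unidirectional/causal): token i attends only to positions <= i
gpt_mask = torch.tril(torch.ones(seq_len, seq_len))
print(gpt_mask)
# tensor([[1., 0., 0., 0., 0.],
#         [1., 1., 0., 0., 0.],
#         ...
#         [1., 1., 1., 1., 1.]])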
Kuaishou, recommendation algorithms role
First round
- Self-introduction
- Understanding of recommender systems: background, outlook, architecture, methods
- Project deep-dive: feature construction, model selection, evaluation metrics
- Paper deep-dive: background, models, evaluation metrics
- Standard theory questions (all drawn from my projects and papers): SVM principle and derivation, logistic regression principle and derivation, XGBoost principle and derivation, how XGBoost handles missing values, how to handle model overfitting
- Live coding: the k smallest numbers in an array (quicksort-style partition; see the "Smallest K Numbers" section below)
Second round
- Self-introduction
- Live coding: compute the square root of x to 3 decimal places (I first got it accepted with a 0.001-step linear scan, then was asked to solve it with gradient descent and Newton's method; see the sketch after this list)
- Competition project deep-dive
- Theory: the principle of Bayesian hyperparameter optimization, the characteristics and differences of the various boosting models, the difference between Euclidean distance and cosine distance
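A possible take on that square-root question (a sketch under the stated constraints; the function names, initial guess, and tolerance are my choices, and a gradient-descent variant minimizing (g^2 - x)^2 would follow the same loop shape):

def sqrt_step(x, step=0.001):
    # Brute-force scan: largest multiple of `step` whose square does not exceed x
    g = 0.0
    while (g + step) ** 2 <= x:
        g += step
    return round(g, 3)

def sqrt_newton(x, eps=1e-9):
    # Newton's method on f(g) = g^2 - x: g <- g - f(g)/f'(g) = (g + x/g) / 2
    g = max(x, 1.0)
    while abs(g * g - x) > eps:
        g = (g + x / g) / 2
    return round(g, 3)

print(sqrt_step(2.0), sqrt_newton(2.0))  # 1.414 1.414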
Third round
- Self-introduction
- Projects and papers, much the same as in rounds one and two
- Live coding: arrange the numbers of an array into the smallest possible number (see the sketch below)
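A common approach to that last question is a concatenation-comparator sort; a minimal sketch (my implementation, not the interviewer's reference solution; normalizing results like "00" to "0" is omitted):

from functools import cmp_to_key

def smallest_number(nums: list[int]) -> str:
    # Order rule: a comes before b if str(a)+str(b) < str(b)+str(a)
    strs = [str(n) for n in nums]
    strs.sort(key=cmp_to_key(lambda a, b: -1 if a + b < b + a else 1))
    return ''.join(strs)

print(smallest_number([3, 30, 34, 5, 9]))  # 3033459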
360, recommendation algorithms role (internship)
How to traverse a tree: how to write depth-first search, how to write breadth-first search (see the sketch below); the principle of cross-entropy, H(p, q) = -sum_x p(x) log q(x)
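A minimal sketch of those two traversals (the TreeNode class and the sample tree are illustrative):

from collections import deque

class TreeNode:
    def __init__(self, val, left=None, right=None):
        self.val, self.left, self.right = val, left, right

def dfs(root):
    # Depth-first (pre-order), recursive
    if root is None:
        return []
    return [root.val] + dfs(root.left) + dfs(root.right)

def bfs(root):
    # Breadth-first (level order), using a queue
    if root is None:
        return []
    order, queue = [], deque([root])
    while queue:
        node = queue.popleft()
        order.append(node.val)
        if node.left:
            queue.append(node.left)
        if node.right:
            queue.append(node.right)
    return order

root = TreeNode(1, TreeNode(2, TreeNode(4)), TreeNode(3))
print(dfs(root), bfs(root))  # [1, 2, 4, 3] [1, 2, 3, 4]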
Q1: What is the difference between a shallow copy and a deep copy?
Reference: "Python direct assignment, shallow copy, and deep copy explained" | Runoob (菜鸟教程)
import copy

original_list = [[1, 2, 3], [4, 5, 6]]
shallow_copy = copy.copy(original_list)    # new outer list, but the inner lists are shared
deep_copy = copy.deepcopy(original_list)   # recursively copies the inner objects as well
original_list[0][0] = 99
print("Original list:", original_list)  # [[99, 2, 3], [4, 5, 6]]
print("Shallow copy:", shallow_copy)    # [[99, 2, 3], [4, 5, 6]] -- sees the mutation through the shared inner list
print("Deep copy:", deep_copy)          # [[1, 2, 3], [4, 5, 6]]  -- unaffected
6 Sort
6.1 Quick Sort

class Solution:
    def quick_sort(self, nums: list[int], left: int, right: int):
        """Quick sort"""
        if left >= right:  # terminate recursion when the subarray has length 1
            return
        pivot = self.partition(nums, left, right)  # pivot partition
        self.quick_sort(nums, left, pivot - 1)     # recurse on the left subarray
        self.quick_sort(nums, pivot + 1, right)    # recurse on the right subarray

    def partition(self, nums: list[int], left: int, right: int) -> int:
        """Pivot partition"""
        i, j = left, right
        pivot = nums[left]  # use nums[left] as the pivot
        while i < j:
            while i < j and nums[j] >= pivot:  # scan right-to-left for the first element smaller than the pivot
                j -= 1
            while i < j and nums[i] <= pivot:  # scan left-to-right for the first element larger than the pivot
                i += 1
            nums[i], nums[j] = nums[j], nums[i]  # swap the two elements
        nums[i], nums[left] = nums[left], nums[i]  # move the pivot to the boundary between the two subarrays
        return i

if __name__ == "__main__":
    nums = [2, 4, 1, 0, 3, 5]
    solution = Solution()
    solution.quick_sort(nums, 0, len(nums) - 1)
    print("After quick sort, nums =", nums)
LeetCode 215. Kth Largest Element in an Array
Solution 1:
from typing import List

class Solution:
    def findKthLargest(self, nums: List[int], k: int) -> int:
        return sorted(nums)[len(nums) - k]
- Time Complexity: O(N log N), where N is the number of array elements.
- Space Complexity: depends on the specific design of the built-in sorting algorithm (Python's Timsort uses up to O(N) auxiliary space).
Solution 2:
import random

class Solution:
    def findKthLargest(self, nums, k):
        def quick_select(nums, k):
            pivot = random.choice(nums)  # pick a random pivot
            # Partition elements greater than, equal to, and less than the pivot into big, equal, small
            big, equal, small = [], [], []
            for num in nums:
                if num > pivot:
                    big.append(num)
                elif num < pivot:
                    small.append(num)
                else:
                    equal.append(num)
            # The kth largest element, counting from the right end of the sorted order
            if k <= len(big):  # the kth largest is in big: recurse into it
                return quick_select(big, k)
            if len(nums) - len(small) < k:  # the kth largest is in small: recurse into it
                return quick_select(small, k - len(nums) + len(small))
            return pivot  # the kth largest is in equal: return the pivot directly
        return quick_select(nums, k)

if __name__ == "__main__":
    nums = [2, 4, 1, 0, 3, 5]
    k = 3
    solution = Solution()
    num = solution.findKthLargest(nums, k)
    print(f"The kth largest number is {num}")
- Time Complexity: O(N) on average, where N is the number of array elements.
  - Partitioning an array of length N costs O(N).
  - After each partition, the subarray recursed into has average length N/2.
  - So on average the total partitioning work is N + N/2 + N/4 + ... < 2N (geometric series), i.e. O(N).
- Space Complexity: O(N) in the worst case, when n - 1 elements all land in one of the lists (big, small, equal) at every level.
Interview question: Smallest K Numbers
from typing import List
import random

class Solution:
    def smallestK(self, arr: List[int], k: int) -> List[int]:
        if k == 0 or not arr:
            return []
        def quick_select(arr, k):
            if not arr:
                return []
            pivot = random.choice(arr)
            big, small, equal = [], [], []
            for num in arr:
                if num > pivot:
                    big.append(num)
                elif num < pivot:
                    small.append(num)
                else:
                    equal.append(num)
            # The k smallest elements, counting from the left end of the sorted order
            if k <= len(small):
                return quick_select(small, k)
            elif k <= len(small) + len(equal):
                return small + equal[:k - len(small)]
            else:
                return small + equal + quick_select(big, k - len(small) - len(equal))
        return quick_select(arr, k)
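A quick usage example (note the k smallest values can come back in any order, since quickselect does not fully sort):

if __name__ == "__main__":
    solution = Solution()
    print(solution.smallestK([1, 3, 5, 7, 2, 4, 6, 8], 4))  # some ordering of [1, 2, 3, 4]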
6.2 Merge Sort

Observe that merge sort recurses in the same order as a post-order traversal of a binary tree.
- Post-order traversal: first recurse into the left subtree, then the right subtree, and finally process the root node.
- Merge sort: first recurse into the left subarray, then the right subarray, and finally merge.
def merge_sort(nums: list[int], left: int, right: int):
    """
    divide stage:
        [9, 7, 5, 6, 4]
        [9, 7, 5],[6, 4]
        [9, 7],[5],[6, 4]
        [9],[7],[5],[6, 4]
        [9],[7],[5],[6],[4]
    merge stage:
        [7, 9],[5],[6],[4]
        [5, 7, 9],[6],[4]
        [5, 7, 9],[4, 6]
        [4, 5, 6, 7, 9]
    """
    # base case: when the subarray length is 1, recursion terminates
    if left >= right:
        return
    # divide stage
    mid = (left + right) // 2         # calculate the midpoint
    merge_sort(nums, left, mid)       # recurse into the left subarray
    merge_sort(nums, mid + 1, right)  # recurse into the right subarray
    # merge stage
    merge(nums, left, mid, right)

def merge(nums: list[int], left: int, mid: int, right: int):
    tmp = [0] * (right - left + 1)  # stores the merged subarray before writing it back
    i, j = left, mid + 1  # left subarray interval: [left, mid]; right subarray interval: [mid+1, right]
    k = 0  # index into the tmp array
    # While both subarrays still have elements, compare and copy the smaller element into tmp
    while i <= mid and j <= right:
        if nums[i] <= nums[j]:
            tmp[k] = nums[i]
            i += 1
        else:
            tmp[k] = nums[j]
            j += 1
        k += 1
    # Copy the remaining elements of the left or right subarray into tmp
    while i <= mid:
        tmp[k] = nums[i]
        i += 1
        k += 1
    while j <= right:
        tmp[k] = nums[j]
        j += 1
        k += 1
    # Copy the elements of tmp back into the corresponding interval of nums
    for k in range(0, len(tmp)):
        nums[left + k] = tmp[k]

if __name__ == '__main__':
    record = [9, 7, 5, 6, 4]
    merge_sort(record, 0, len(record) - 1)
    print(record)
Sword Offer 51. Reverse Pairs: count the pairs (i, j) with i < j and nums[i] > nums[j]; this piggybacks on merge sort.
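Merge sort counts these inversions for free during the merge step; a sketch in the spirit of the merge sort above (the counting logic and function names are my additions):

def reverse_pairs(nums: list[int]) -> int:
    def sort_count(arr):
        # Returns (sorted copy of arr, number of inversions inside arr)
        if len(arr) <= 1:
            return arr, 0
        mid = len(arr) // 2
        left, cl = sort_count(arr[:mid])
        right, cr = sort_count(arr[mid:])
        merged, count, i, j = [], cl + cr, 0, 0
        while i < len(left) and j < len(right):
            if left[i] <= right[j]:
                merged.append(left[i])
                i += 1
            else:
                # left[i] > right[j]: every remaining element of left
                # forms an inversion with right[j]
                count += len(left) - i
                merged.append(right[j])
                j += 1
        merged.extend(left[i:])
        merged.extend(right[j:])
        return merged, count
    return sort_count(nums)[1]

print(reverse_pairs([7, 5, 6, 4]))  # 5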

7 Decode
7.1 Beam Search
import torch
import torch.nn.functional as F

def beam_search(LM_prob, beam_size=3):
    batch, seq_len, vocab_size = LM_prob.shape
    log_LM_prob = LM_prob.log()
    # Step 0: take the beam_size most probable tokens at position 0;
    # log_beam_prob and indices have shape (batch, beam)
    log_beam_prob, indices = log_LM_prob[:, 0, :].topk(beam_size, sorted=True)
    indices = indices.unsqueeze(-1)
    for i in range(1, seq_len):  # run beam search over each position
        # log_beam_prob: (batch, beam, vocab_size), candidate probabilities for each beam
        log_beam_prob = log_beam_prob.unsqueeze(-1) + log_LM_prob[:, i, :].unsqueeze(1).repeat(1, beam_size, 1)
        # keep the beam_size highest-probability candidates of this step
        log_beam_prob, index = log_beam_prob.view(batch, -1).topk(beam_size, sorted=True)
        # beam_id tells which previous beam each new beam came from; index is the actual token id
        beam_id = index // vocab_size  # beam_id, index: (batch, beam)
        index = index % vocab_size
        new_indices = []
        for j, bid, idx in zip(range(batch), beam_id, index):
            # for each sample in the batch, select the source beams and append the new token ids
            new_indices.append(torch.cat([indices[j][bid], idx.unsqueeze(-1)], -1))
        indices = torch.stack(new_indices, 0)
    return indices, log_beam_prob

if __name__ == '__main__':
    # a dummy "language model": fixed per-position distributions LM_prob (batch, seq_len, vocab_size)
    LM_prob = F.softmax(torch.randn([32, 20, 1000]), dim=-1)
    # returns each candidate and its log_prob; indices has shape (batch, beam_size, seq_len)
    indices, log_prob = beam_search(LM_prob, beam_size=3)
    print(indices)