设计推荐系统
问题
如何用 Python 设计一个推荐系统?协同过滤和基于内容的推荐有什么区别?
答案
架构
协同过滤
collaborative_filtering.py
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
class UserBasedCF:
    """User-based collaborative filtering.

    Ranks the items a user has not rated by summing the ratings that the
    most similar users gave them, each rating weighted by that neighbour's
    cosine similarity to the target user.
    """

    def __init__(self, ratings: np.ndarray):
        # ratings: user x item rating matrix; only entries > 0 are treated
        # as "rated" by recommend().
        self.ratings = ratings
        # User-user cosine similarity matrix computed from the rating rows.
        self.user_sim = cosine_similarity(ratings)

    def recommend(
        self, user_id: int, top_k: int = 10, num_neighbors: int = 49
    ) -> list[int]:
        """Return up to ``top_k`` unrated item indices, best first.

        Args:
            user_id: Row index of the target user in the rating matrix.
            top_k: Maximum number of items to return.
            num_neighbors: How many of the most similar users contribute to
                the scores. The default of 49 matches the previously
                hard-coded neighbourhood size.

        Returns:
            Item indices sorted by descending weighted score. Items with
            equal scores keep ascending index order. Items no neighbour has
            rated get score 0 and are still eligible.

        Raises:
            IndexError: If ``user_id`` is outside the similarity matrix.
        """
        sim_scores = self.user_sim[user_id]

        # Drop the target user by index rather than assuming it sorts first:
        # ties or rounding in the similarity row can push another user to
        # rank 0, which would discard that neighbour and keep the target.
        self_idx = user_id % self.ratings.shape[0]  # normalise negative ids
        by_similarity = np.argsort(sim_scores)[::-1]
        neighbors = by_similarity[by_similarity != self_idx]
        neighbors = neighbors[:max(num_neighbors, 0)]

        # Only positive ratings count as votes; everything else contributes 0.
        neighbor_ratings = self.ratings[neighbors]
        votes = np.where(neighbor_ratings > 0, neighbor_ratings, 0)
        scores = sim_scores[neighbors] @ votes  # one score per item

        # Candidates are the items the target user has not rated (> 0).
        candidates = np.flatnonzero(~(self.ratings[user_id] > 0))
        # Stable sort on negated scores: descending score, ties by item index.
        order = np.argsort(-scores[candidates], kind="stable")
        return candidates[order][:top_k].tolist()
class ItemBasedCF:
    """Item-based collaborative filtering.

    Scores each unrated item by how similar it is to the items the user has
    already rated, weighting every similarity by the user's rating.
    """

    def __init__(self, ratings: np.ndarray):
        self.ratings = ratings
        # Transpose so that each row handed to cosine_similarity is an item.
        per_item = ratings.T
        self.item_sim = cosine_similarity(per_item)

    def recommend(self, user_id: int, top_k: int = 10) -> list[int]:
        """Return up to ``top_k`` unrated item indices, best first.

        Items with equal scores keep ascending index order.
        """
        profile = self.ratings[user_id]
        seen = np.flatnonzero(profile > 0)

        def affinity(candidate: int):
            # Rating-weighted similarity between the candidate and seen items.
            return sum(self.item_sim[candidate, j] * profile[j] for j in seen)

        # Only items without a positive rating are scored.
        scored = {
            candidate: affinity(candidate)
            for candidate in range(self.ratings.shape[1])
            if not profile[candidate] > 0
        }
        # sorted() is stable, so equal scores stay in insertion (index) order.
        ranked = sorted(scored.items(), key=lambda pair: pair[1], reverse=True)
        return [candidate for candidate, _ in ranked[:top_k]]
基于内容推荐
content_based.py
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
class ContentBasedRecommender:
    """Content-based recommender over TF-IDF vectors of item text."""

    def __init__(self, items: list[dict]):
        # Each item dict is read through its 'title' and 'tags' keys.
        self.items = items
        # Build one text document per item and extract TF-IDF features.
        corpus = [f"{item['title']} {item['tags']}" for item in items]
        self.tfidf = TfidfVectorizer(max_features=5000)
        self.item_vectors = self.tfidf.fit_transform(corpus)

    def recommend(self, item_id: int, top_k: int = 10) -> list[int]:
        """Return the indices of the items most similar to ``item_id``.

        Args:
            item_id: Index of the query item in the list given to __init__.
            top_k: Maximum number of similar items to return.

        Returns:
            Up to ``top_k`` item indices ordered by descending cosine
            similarity. The query item itself is never included.
        """
        target = self.item_vectors[item_id]
        similarities = cosine_similarity(target, self.item_vectors).flatten()

        # Use the array's own argsort so this snippet does not depend on a
        # numpy import that content_based.py never declares.
        ranked = similarities.argsort()[::-1]

        # Exclude the query item by index instead of assuming it sorts first:
        # an item with identical text ties at the top and could take rank 0,
        # which would return the query item and drop its duplicate.
        self_idx = item_id % similarities.shape[0]  # normalise negative ids
        ranked = ranked[ranked != self_idx]
        return ranked[:max(top_k, 0)].tolist()
深度学习排序
ranking_model.py
import torch
import torch.nn as nn
class DeepFM(nn.Module):
    """DeepFM ranking model.

    Combines a first-order linear term, a second-order factorization-machine
    interaction term and a feed-forward network over shared field embeddings,
    then squashes the summed logit with a sigmoid.
    """

    def __init__(self, field_dims: list[int], embed_dim: int = 8):
        super().__init__()
        # One embedding table per field, shared by the FM and DNN parts.
        self.embedding = nn.ModuleList(
            nn.Embedding(vocab_size, embed_dim) for vocab_size in field_dims
        )
        # First-order FM weights: a scalar per field value.
        self.linear = nn.ModuleList(
            nn.Embedding(vocab_size, 1) for vocab_size in field_dims
        )
        # DNN tower over the concatenated field embeddings.
        widths = [len(field_dims) * embed_dim, 128, 64]
        layers = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers += [nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(0.3)]
        layers.append(nn.Linear(widths[-1], 1))
        self.dnn = nn.Sequential(*layers)

    def forward(self, x):
        """Score a batch; ``x`` holds per-field indices, (batch, num_fields)."""
        vectors = []
        first_order = 0
        for field in range(x.shape[1]):
            ids = x[:, field]
            vectors.append(self.embedding[field](ids))
            # (batch, 1) -> (batch,)
            first_order = first_order + self.linear[field](ids).squeeze(-1)

        stacked = torch.stack(vectors, dim=1)  # (batch, fields, dim)

        # FM pairwise interactions: 0.5 * ((sum v)^2 - sum v^2), summed over dim.
        field_sum = stacked.sum(dim=1)
        sum_then_square = field_sum.pow(2).sum(dim=-1)
        square_then_sum = stacked.pow(2).sum(dim=1).sum(dim=-1)
        second_order = 0.5 * (sum_then_square - square_then_sum)

        # Flattening (batch, fields, dim) equals concatenating the embeddings.
        deep = self.dnn(stacked.flatten(start_dim=1)).squeeze(-1)

        logit = first_order + second_order + deep
        return torch.sigmoid(logit)
常见面试问题
Q1: User-Based CF vs Item-Based CF?
答案:
| 维度 | User-Based | Item-Based |
|---|---|---|
| 思路 | 推荐相似用户喜欢的物品 | 推荐与用户喜欢过的物品相似的物品 |
| 实时性 | 差(用户行为多变) | 好(物品关系稳定) |
| 适用场景 | 用户少、物品多 | 物品少、用户多(电商) |
| 冷启动 | 新用户无法推荐 | 新物品无法推荐 |
Q2: 如何解决冷启动问题?
答案:
- 新用户:热门推荐 → 引导兴趣标签 → 基于内容推荐
- 新物品:基于内容特征找相似已有物品
- 系统冷启动:初期用基于规则/编辑推荐,积累数据后再切换
Q3: 召回和排序的区别?
答案:
- 召回:从全量候选(百万级)中快速筛出千级候选集,可以多路召回
- 排序:用复杂模型对召回结果精排,输出最终推荐列表
- 重排:业务规则调整(去重、多样性、运营干预)