Wikipedia principal eigenvector#
A classical way to assert the relative importance of vertices in a graph is to compute the principal eigenvector of the adjacency matrix, so as to assign to each vertex the value of the corresponding component of the first eigenvector as a centrality score:
On the graph of webpages and links, those values are called the PageRank scores by Google.
The goal of this example is to analyze the graph of links inside Wikipedia articles to rank articles by relative importance according to this eigenvector centrality.
The traditional way to compute the principal eigenvector is to use the power iteration method. Here, instead, the computation is achieved thanks to Martinsson's randomized SVD algorithm implemented in scikit-learn.
The graph data is fetched from the DBpedia dumps. DBpedia is an extraction of the latent structured data of the Wikipedia content.
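As a minimal illustration of the idea (not part of the original example), the following sketch runs a plain power iteration on a made-up 4-node adjacency matrix; the components of the resulting vector are the centrality scores, and the most connected node comes out on top.

import numpy as np

# Toy power iteration on a hypothetical 4-node undirected graph (illustration only).
A = np.array(
    [
        [0, 1, 1, 0],
        [1, 0, 1, 0],
        [1, 1, 0, 1],
        [0, 0, 1, 0],
    ],
    dtype=float,
)
v = np.ones(A.shape[0]) / A.shape[0]  # uniform initial guess
for _ in range(100):
    v = A @ v
    v /= np.linalg.norm(v)  # renormalize at each step
print(v)  # node 2, the most connected vertex, gets the largest component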
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import os
from bz2 import BZ2File
from datetime import datetime
from pprint import pprint
from time import time
from urllib.request import urlopen
import numpy as np
from scipy import sparse
from sklearn.decomposition import randomized_svd
Download data, if not already on disk#
redirects_url = "http://downloads.dbpedia.org/3.5.1/en/redirects_en.nt.bz2"
redirects_filename = redirects_url.rsplit("/", 1)[1]
page_links_url = "http://downloads.dbpedia.org/3.5.1/en/page_links_en.nt.bz2"
page_links_filename = page_links_url.rsplit("/", 1)[1]
resources = [
    (redirects_url, redirects_filename),
    (page_links_url, page_links_filename),
]

for url, filename in resources:
    if not os.path.exists(filename):
        print("Downloading data from '%s', please wait..." % url)
        opener = urlopen(url)
        with open(filename, "wb") as f:
            f.write(opener.read())
        print()
Loading the redirect files#
def index(redirects, index_map, k):
    """Find the index of an article name after redirect resolution"""
    k = redirects.get(k, k)
    return index_map.setdefault(k, len(index_map))
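For instance, with made-up article names, redirected aliases map to the same integer index (illustration only, not part of the original example):

_redirects = {b"UK": b"United_Kingdom"}  # hypothetical redirect map
_index_map = {}
print(index(_redirects, _index_map, b"United_Kingdom"))  # 0
print(index(_redirects, _index_map, b"UK"))  # 0, resolved to the same article
print(index(_redirects, _index_map, b"France"))  # 1, a new index is assigned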
DBPEDIA_RESOURCE_PREFIX_LEN = len("http://dbpedia.org/resource/")
SHORTNAME_SLICE = slice(DBPEDIA_RESOURCE_PREFIX_LEN + 1, -1)
def short_name(nt_uri):
    """Remove the < and > URI markers and the common URI prefix"""
    return nt_uri[SHORTNAME_SLICE]
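Applied to a hypothetical NT token (BZ2File yields bytes, hence the b"" literal), the slice drops the angle brackets and the http://dbpedia.org/resource/ prefix:

# Illustration with a made-up NT token, not taken from the actual dump.
print(short_name(b"<http://dbpedia.org/resource/Power_iteration>"))  # b'Power_iteration'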
def get_redirects(redirects_filename):
    """Parse the redirections and build a transitively closed map out of it"""
    redirects = {}
    print("Parsing the NT redirect file")
    for l, line in enumerate(BZ2File(redirects_filename)):
        split = line.split()
        if len(split) != 4:
            print("ignoring malformed line: %r" % line)
            continue
        redirects[short_name(split[0])] = short_name(split[2])
        if l % 1000000 == 0:
            print("[%s] line: %08d" % (datetime.now().isoformat(), l))

    # compute the transitive closure
    print("Computing the transitive closure of the redirect relation")
    for l, source in enumerate(redirects.keys()):
        transitive_target = None
        target = redirects[source]
        seen = {source}
        while True:
            transitive_target = target
            target = redirects.get(target)
            if target is None or target in seen:
                break
            seen.add(target)
        redirects[source] = transitive_target
        if l % 1000000 == 0:
            print("[%s] line: %08d" % (datetime.now().isoformat(), l))

    return redirects
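The transitive closure guarantees that chains of redirects resolve to their final target. The same loop, condensed and applied to a made-up in-memory dict (illustration only):

_toy = {b"A": b"B", b"B": b"C", b"C": b"D"}  # hypothetical redirect chain
for _src in list(_toy):
    _seen = {_src}
    _target = _toy[_src]
    while _toy.get(_target) is not None and _toy[_target] not in _seen:
        _seen.add(_target)
        _target = _toy[_target]
    _toy[_src] = _target
print(_toy)  # {b'A': b'D', b'B': b'D', b'C': b'D'}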
Computing the adjacency matrix#
def get_adjacency_matrix(redirects_filename, page_links_filename, limit=None):
    """Extract the adjacency graph as a scipy sparse matrix

    Redirects are resolved first.

    Returns X, the scipy sparse adjacency matrix, redirects as a python
    dict from article names to article names, and index_map, a python dict
    from article names to python ints (article indexes).
    """

    print("Computing the redirect map")
    redirects = get_redirects(redirects_filename)

    print("Computing the integer index map")
    index_map = dict()
    links = list()
    for l, line in enumerate(BZ2File(page_links_filename)):
        split = line.split()
        if len(split) != 4:
            print("ignoring malformed line: %r" % line)
            continue
        i = index(redirects, index_map, short_name(split[0]))
        j = index(redirects, index_map, short_name(split[2]))
        links.append((i, j))
        if l % 1000000 == 0:
            print("[%s] line: %08d" % (datetime.now().isoformat(), l))

        if limit is not None and l >= limit - 1:
            break

    print("Computing the adjacency matrix")
    X = sparse.lil_matrix((len(index_map), len(index_map)), dtype=np.float32)
    for i, j in links:
        X[i, j] = 1.0
    del links
    print("Converting to CSR representation")
    X = X.tocsr()
    print("CSR conversion done")
    return X, redirects, index_map
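The LIL (list of lists) format makes incremental assignment cheap, and the final conversion to CSR enables the fast row-wise operations used below. The same pattern on a tiny made-up link list (illustration only):

_toy_links = [(0, 1), (1, 2), (2, 0)]  # hypothetical article links
_X = sparse.lil_matrix((3, 3), dtype=np.float32)
for _i, _j in _toy_links:
    _X[_i, _j] = 1.0
_X = _X.tocsr()
print(_X.toarray())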
# stop after 5M links to make it possible to work in RAM
X, redirects, index_map = get_adjacency_matrix(
    redirects_filename, page_links_filename, limit=5000000
)
names = {i: name for name, i in index_map.items()}
Computing the principal singular vectors using randomized SVD#
print("Computing the principal singular vectors using randomized_svd")
t0 = time()
U, s, V = randomized_svd(X, 5, n_iter=3)
print("done in %0.3fs" % (time() - t0))
# print the names of the wikipedia related strongest components of the
# principal singular vector, which should be similar to the highest eigenvector
print("Top wikipedia pages according to principal singular vectors")
pprint([names[i] for i in np.abs(U.T[0]).argsort()[-10:]])
pprint([names[i] for i in np.abs(V[0]).argsort()[-10:]])
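U holds one column per requested component and V one row per component, so U.T[0] and V[0] are the first left and right singular vectors; argsort()[-10:] then selects the indices of their ten largest absolute entries. A quick shape check on a small random sparse matrix (illustration only, not part of the original example):

_rng = np.random.RandomState(0)
_M = sparse.random(
    100, 100, density=0.05, format="csr", dtype=np.float32, random_state=_rng
)
_U, _s, _V = randomized_svd(_M, 5, n_iter=3, random_state=_rng)
print(_U.shape, _s.shape, _V.shape)  # (100, 5) (5,) (5, 100)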
Computing centrality scores#
def centrality_scores(X, alpha=0.85, max_iter=100, tol=1e-10):
    """Power iteration computation of the principal eigenvector

    This method is also known as Google PageRank and the implementation
    is based on the one from the NetworkX project (BSD licensed too)
    with copyrights by:

      Aric Hagberg <hagberg@lanl.gov>
      Dan Schult <dschult@colgate.edu>
      Pieter Swart <swart@lanl.gov>
    """
    n = X.shape[0]
    X = X.copy()
    incoming_counts = np.asarray(X.sum(axis=1)).ravel()

    print("Normalizing the graph")
    for i in incoming_counts.nonzero()[0]:
        X.data[X.indptr[i] : X.indptr[i + 1]] *= 1.0 / incoming_counts[i]
    dangle = np.asarray(np.where(np.isclose(X.sum(axis=1), 0), 1.0 / n, 0)).ravel()

    scores = np.full(n, 1.0 / n, dtype=np.float32)  # initial guess
    for i in range(max_iter):
        print("power iteration #%d" % i)
        prev_scores = scores
        scores = (
            alpha * (scores * X + np.dot(dangle, prev_scores))
            + (1 - alpha) * prev_scores.sum() / n
        )
        # check convergence: normalized l_inf norm
        scores_max = np.abs(scores).max()
        if scores_max == 0.0:
            scores_max = 1.0
        err = np.abs(scores - prev_scores).max() / scores_max
        print("error: %0.6f" % err)
        if err < n * tol:
            return scores

    return scores
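For reference, the update inside the loop above is the usual PageRank iteration with a uniform teleportation term and a dangling-page correction; writing \hat{A} for the row-normalized adjacency matrix and D for the set of dangling pages, it reads

s^{(t+1)} = \alpha \Big( s^{(t)} \hat{A} + \frac{1}{n} \sum_{i \in D} s^{(t)}_i \, \mathbf{1} \Big) + \frac{1 - \alpha}{n} \Big( \sum_{i} s^{(t)}_i \Big) \mathbf{1}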
print("Computing principal eigenvector score using a power iteration method")
t0 = time()
scores = centrality_scores(X, max_iter=100)
print("done in %0.3fs" % (time() - t0))
pprint([names[i] for i in np.abs(scores).argsort()[-10:]])
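As a quick sanity check (illustration only, not part of the original data), the same function can be run on a tiny hand-built graph in which pages 1 and 2 both link to page 0, which should therefore receive the highest score:

_toy_graph = sparse.csr_matrix(
    np.array(
        [
            [0, 1, 0],  # page 0 links to page 1
            [1, 0, 0],  # page 1 links to page 0
            [1, 0, 0],  # page 2 links to page 0
        ],
        dtype=np.float32,
    )
)
print(centrality_scores(_toy_graph, max_iter=50))  # page 0 gets the largest score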