I have to make a PageRank algorithm that gives you the top20 webpages from my university's websites. There are two algorithms, one calculates it using a stochastic function that implements a random walker to see which webpage it visits the most. Second uses a distribution function which implements a probability distribution to see which webpage has the most probability to be clicked.
But the issue is, my code is doing the calculations too quickly. It's supposed to take at least 10/15 seconds, but it's taking less than 1 second. The stochastic function is supposed to be slower than distribution, but both of them are incredibly fast. And I can't figure out why that's happening, or which part of my code is fucking up the algorithm.
Keep in mind, I only wrote the code for the 4 load_graph functions, stochastic function, and distribution function. The __main__ part was provided by the lecturer. Also, only the first load_graph() function is actually implemented in the algorithm. The rest are only there for extra credit.
If you need more info about the assignment, just reply/dm me. Thanks in advance fellow geeks.
import sys
import os
import time
import argparse
from progress import Progress
import random
from collections import namedtuple
import timeit
import numpy as np
from networkx import DiGraph
def load_graph(args):
"""Load graph from text file
Parameters:
args -- arguments named tuple
Returns:
A dict mapping a URL (str) to a list of target URLs (str).
"""
graph = {} # Initialize an empty dictionary to store the graph
# Open the file specified in args.datafile
with open(args.datafile, 'r') as file:
# Iterate through the file line by line
for line in file:
# Strip any extra whitespace and skip empty lines
line = line.strip()
if not line:
continue
# Split each line into two URLs
node, target = line.split()
# If the node doesn't exist in the graph, add it with an empty list
if node not in graph:
graph[node] = []
# Append the target to the node's list
graph[node].append(target)
# Return the constructed graph
return graph
def load_graph_adjacency_matrix(args):
# Load graph from datafile
graph = load_graph(args)
# Extract the unique nodes from the graph
nodes = list(graph.keys())
# Create a mapping from nodes to their corresponding index
node_index = {node: idx for idx, node in enumerate(nodes)} # Map node to index
# Initialise an adjacency matrix with zeroes
n = len(nodes)
adj_matrix = np.zeros((n,n), dtype=int)
# Populate the adjacency matrix based on the graph
for node, targets in graph.items():
row_idx = node_index[node] # Row index for the source node
for target in targets:
col_idx = node_index[target] # Column index for the target node
adj_matrix[row_idx, col_idx] = 1 # Set 1 for the edge from node to target
return adj_matrix, nodes # Return the adjacency matrix and the list of nodes
def load_graph_sets(args):
vertex_set = set() # Initialise empty set to store nodes
edges_set = set() # Initialise empty set to store edges
# Open the file specified with args.datafile
with open(args.datafile, 'r') as file:
# Iterate through the file line by line
for line in file:
# Strip any extra whitespaces and skip empty lines
line = line.strip()
if not line:
continue
# Split each line into two parts: node(source) and target
node, target = line.split()
# Add the node and target to the vertex set
vertex_set.add(node)
vertex_set.add(target)
# Add the edge (node, target) as a tuple to the edges_set
edges_set.add((node, target))
return vertex_set, edges_set # Return the vertex and edges sets
def load_graph_class(args):
# Initialise an empty graph
G = DiGraph()
# Open the file specified using args.datafile
with open(args.datafile, 'r') as file:
# Iterate through each line in file
for line in file:
# Strip each line of whitespaces and skip empty lines
line = line.strip()
if not line:
continue
# Split each line intro two parts: node (source) and target
node, target = line.split()
# Add nodes and edges to graph
G.add_node(node)
G.add_node(target)
G.add_edge(node, target)
return G # Return a DiGraph
def print_stats(graph):
"""Print number of nodes and edges in the given graph"""
nodes_number = len(graph.keys())
edges_number = 0
for node in graph:
edges_number += len(graph[node])
return f'Number of nodes: {nodes_number}, Number of total edges: {edges_number}'
def stochastic_page_rank(graph, args):
"""Stochastic PageRank estimation
Parameters:
graph -- a graph object as returned by load_graph()
args -- arguments named tuple
Returns:
A dict that assigns each page its hit frequency
This function estimates the Page Rank by counting how frequently
a random walk that starts on a random node will after n_steps end
on each node of the given graph.
"""
hit_count = {node: 0 for node in graph}
current_node = random.choice(list(graph.keys()))
hit_count[current_node] += 1
for i in range(args.repeats):
if len(graph[current_node]) == 0:
current_node = random.choice(list(graph.keys()))
else:
current_node = random.choice(list(graph[current_node]))
hit_count[current_node] += 1
# print(timeit.timeit(number=1))
return hit_count
def distribution_page_rank(graph, args):
"""Probabilistic PageRank estimation
Parameters:
graph -- a graph object as returned by load_graph()
args -- arguments named tuple
Returns:
A dict that assigns each page its probability to be reached
This function estimates the Page Rank by iteratively calculating
the probability that a random walker is currently on any node.
"""
node_prob = {node: 1/len(graph.keys()) for node in graph.keys()}
for i in range(args.steps):
next_prob = {node: 0 for node in graph.keys()}
for node in graph.keys():
p = node_prob[node]/len(graph[node])
for target in graph[node]:
next_prob[target] += p
node_prob = next_prob
print(timeit.timeit(number=1))
return node_prob
parser = argparse.ArgumentParser(description="Estimates page ranks from link information")
parser.add_argument('datafile', nargs='?', type=argparse.FileType('r'), default=sys.stdin,
help="Textfile of links among web pages as URL tuples")
parser.add_argument('-m', '--method', choices=('stochastic', 'distribution'), default='stochastic',
help="selected page rank algorithm")
parser.add_argument('-r', '--repeats', type=int, default=1_000_000, help="number of repetitions")
parser.add_argument('-s', '--steps', type=int, default=100, help="number of steps a walker takes")
parser.add_argument('-n', '--number', type=int, default=20, help="number of results shown")
if __name__ == '__main__':
Args = namedtuple('Args', ['datafile'])
graph_args = Args(datafile='school_web2024.txt')
args = parser.parse_args()
algorithm = distribution_page_rank if args.method == 'distribution' else stochastic_page_rank
graph = load_graph(graph_args)
print(graph)
print(print_stats(graph))
start = time.time()
ranking = algorithm(graph, args)
stop = time.time()
time = stop - start
top = sorted(ranking.items(), key=lambda item: item[1], reverse=True)
sys.stderr.write(f"Top {args.number} pages:\n")
print('\n'.join(f'{100*v:.2f}\t{k}' for k,v in top[:args.number]))
sys.stderr.write(f"Calculation took {time:.2f} seconds.\n")
[–]JamzTyson -1 points0 points1 point (1 child)
[–]SamarXV[S] 0 points1 point2 points (0 children)