分布模型與排名估計分析

1. 核心問題
給定一個具體的數值佔比,如何透過不同的(離散)分布模型來估計其可能的排名位置?
2. 核心流程
確定模型與參數
實現模型分布
設置模型參數(α, β 等)
計算各排序數值佔比
計算各排名位置的數值
將數值轉換為數值佔比
尋找目標排名
尋找最接近目標數值佔比的排名數值佔比
評估估計誤差範圍
3. 模型比較
| 模型 | 核心參數 | 分布特徵 | 主要優點 | 限制 |
| --- | --- | --- | --- | --- |
| Power Law | α:冪律指數 | - 嚴格的長尾特性<br>- 在雙對數(log-log)尺度下呈線性衰減 | - 能準確描述極端不平等<br>- 易於理解和解釋 | - 無法描述複雜的分布變化<br>- 在中間區域擬合度較差 |
| Stretched Exponential | β:拉伸參數<br>x₀:特徵尺度 | - 較快的初始衰減<br>- 溫和的尾部 | - 可描述漸進轉變<br>- 平滑的分布形態 | - 難以捕捉突變點<br>- 參數物理含義不直觀 |
4. 實現代碼
from dataclasses import dataclass
from typing import Callable, Dict, List, Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np
@dataclass
class DPLNParams:
    """Bundle of three numeric parameters and a display label.

    NOTE(review): the name suggests a double-Pareto-lognormal (DPLN) model,
    but nothing in the visible code reads this class -- confirm the intended
    meaning of each field against its consumer.
    """

    # Presumably the power-law (Pareto) exponent -- TODO confirm.
    alpha: float
    # Presumably the log-normal location parameter -- TODO confirm.
    mu: float
    # Presumably the log-normal scale parameter -- TODO confirm.
    sigma: float
    # Free-form name identifying this parameter set.
    label: str
class DistributionModels:
    """Rank-based models returning each rank's share of the total, in percent.

    Every public model maps an array of ranks to non-normalised weights and
    then rescales them so that the returned shares sum to 100.
    """

    @staticmethod
    def _as_percentages(values: np.ndarray) -> np.ndarray:
        """Rescale *values* so that they sum to 100."""
        return values * 100 / np.sum(values)

    @staticmethod
    def power_law(x: np.ndarray, alpha: float) -> np.ndarray:
        """Return power-law value percentages for the ranks in *x*.

        Args:
            x: Ranks (any array-like of numbers); the weight of each rank is
                ``rank ** -alpha``.
            alpha: Power-law exponent.

        Returns:
            Array with the same shape as *x* whose entries sum to 100.
        """
        # np.asarray accepts lists/tuples as well as arrays; the float dtype
        # is required because integer arrays cannot be raised to negative
        # powers.
        ranks = np.asarray(x, dtype=float)
        return DistributionModels._as_percentages(ranks ** (-alpha))

    @staticmethod
    def stretched_exponential(x: np.ndarray, beta: float, x0: float = 1) -> np.ndarray:
        """Return stretched-exponential value percentages for the ranks in *x*.

        Args:
            x: Ranks (any array-like of numbers); the weight of each rank is
                ``exp(-(rank / x0) ** beta)``.
            beta: Stretching exponent.
            x0: Characteristic scale dividing the ranks.

        Returns:
            Array with the same shape as *x* whose entries sum to 100.
        """
        # Same input coercion as power_law so both models accept array-likes.
        ranks = np.asarray(x, dtype=float)
        return DistributionModels._as_percentages(np.exp(-(ranks / x0) ** beta))
class DistributionAnalyzer:
    """Estimate which rank matches a target value percentage under a model.

    The analyzer evaluates a distribution model on ranks ``1..max_rank`` and
    picks the rank whose modelled percentage is closest to the target.
    """

    def __init__(self, target_percentage: float, N: int = 10000, max_rank: int = 200):
        """Store the target and build the rank axis.

        Args:
            target_percentage: Target share, in the same percent units the
                models return (their outputs sum to 100).
            N: Stored on the instance as ``self.N``.
            max_rank: Highest rank evaluated; ranks run from 1 to this value.
        """
        self.target_percentage = target_percentage
        # NOTE(review): N is stored but never read in this class; shares are
        # normalised over ranks 1..max_rank only -- confirm whether
        # normalising over N items was intended.
        self.N = N
        self.max_rank = max_rank
        self.ranks = np.arange(1, max_rank + 1)
        self.models = DistributionModels()

    def calculate_distribution(self,
                               distribution_func: Callable[..., np.ndarray],
                               params: dict,
                               ranks: Optional[np.ndarray] = None) -> np.ndarray:
        """Evaluate *distribution_func* on *ranks* with keyword *params*.

        Args:
            distribution_func: Callable invoked as ``func(ranks, **params)``.
            params: Keyword arguments forwarded to *distribution_func*.
            ranks: Ranks to evaluate; defaults to ``self.ranks``.

        Returns:
            Whatever *distribution_func* returns for the given ranks.
        """
        if ranks is None:
            ranks = self.ranks
        return distribution_func(ranks, **params)

    def _closest_rank(self,
                      percentages: np.ndarray,
                      target_percentage: float) -> Tuple[int, float, float]:
        """Pick the entry of *percentages* nearest to *target_percentage*.

        *percentages* must be aligned with ``self.ranks``.
        """
        percentages = np.asarray(percentages, dtype=float)
        if percentages.size == 0:
            # np.argmin would raise ValueError here too; say why instead.
            raise ValueError("no ranks to search: max_rank must be at least 1")
        differences = np.abs(percentages - target_percentage)
        best_idx = int(np.argmin(differences))
        # Convert NumPy scalars to builtin types to honour the annotation.
        return (
            int(self.ranks[best_idx]),
            float(percentages[best_idx]),
            float(differences[best_idx]),
        )

    def find_closest_rank(self,
                          target_percentage: float,
                          distribution_func: Callable[..., np.ndarray],
                          params: dict) -> Tuple[int, float, float]:
        """Find the rank whose modelled percentage is closest to the target.

        Args:
            target_percentage: Percentage to look for.
            distribution_func: Model evaluated on ``self.ranks``.
            params: Keyword arguments for *distribution_func*.

        Returns:
            Tuple[int, float, float]: (rank, actual_percentage, error), where
            error is the absolute difference to the target.

        Raises:
            ValueError: If there are no ranks to search.
        """
        percentages = self.calculate_distribution(distribution_func, params)
        return self._closest_rank(percentages, target_percentage)

    def _analyze_family(self,
                        ax: plt.Axes,
                        distribution_func: Callable[..., np.ndarray],
                        param_name: str,
                        symbol: str,
                        values: List[float]) -> List[Dict]:
        """Plot one model for each parameter value and collect closest ranks."""
        entries = []
        for value in values:
            # Evaluate the model once and reuse it for both search and plot.
            percentages = self.calculate_distribution(
                distribution_func, {param_name: value}
            )
            rank, actual, error = self._closest_rank(
                percentages, self.target_percentage
            )
            entries.append({
                param_name: value,
                'rank': rank,
                'actual_percentage': actual,
                'error': error
            })
            ax.scatter(self.ranks, percentages, alpha=0.6,
                       label=f'{symbol}={value}')
        return entries

    def create_visualization(self,
                             alphas: List[float],
                             betas: List[float],
                             figsize: Tuple[int, int] = (15, 6)) -> Tuple[plt.Figure, Dict]:
        """Plot both model families and return the closest-rank analysis.

        Args:
            alphas: Power-law exponents to evaluate (left panel).
            betas: Stretched-exponential exponents to evaluate (right panel).
            figsize: Figure size passed to ``plt.subplots``.

        Returns:
            Tuple[plt.Figure, Dict]: (figure, results_dictionary). The
            dictionary has keys ``'power_law'`` and ``'stretched_exp'``, each
            a list of dicts with the parameter value, ``'rank'``,
            ``'actual_percentage'`` and ``'error'``.
        """
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize)
        results = {
            'power_law': self._analyze_family(
                ax1, self.models.power_law, 'alpha', 'α', alphas
            ),
        }
        self._format_axis(ax1, 'Power Law')
        results['stretched_exp'] = self._analyze_family(
            ax2, self.models.stretched_exponential, 'beta', 'β', betas
        )
        self._format_axis(ax2, 'Stretched Exponential')
        # Lay out the figure created above rather than the implicit current one.
        fig.tight_layout()
        return fig, results

    def _format_axis(self, ax: plt.Axes, title: str):
        """Draw the target line and apply the shared axis formatting."""
        ax.axhline(y=self.target_percentage, color='r',
                   linestyle='--', label=f'target ({self.target_percentage:.2f}%)')
        ax.set_title(title)
        ax.set_xlabel('Rank')
        ax.set_ylabel('Value Percentage')
        ax.set_yscale('log')
        ax.set_xlim(0, self.max_rank)
        ax.grid(True)
        # Called last so the legend includes the target line added above.
        ax.legend()
# Example usage: sweep a few exponents per model and report the closest ranks.
if __name__ == "__main__":
    # Exponents to evaluate for each model family.
    alphas = [1.0, 1.2, 1.4]
    betas = [0.4, 0.5, 0.6]

    # Target share of 0.67 %; N and max_rank keep their defaults.
    analyzer = DistributionAnalyzer(0.67)
    fig, results = analyzer.create_visualization(alphas, betas)

    # One summary line per evaluated exponent.
    print("\nPower Law Results:")
    for result in results['power_law']:
        summary = (f"α={result['alpha']:.1f}: "
                   f"Rank={result['rank']}, "
                   f"Actual={result['actual_percentage']:.2f}%, "
                   f"Error={result['error']:.2f}%")
        print(summary)

    print("\nStretched Exponential Results:")
    for result in results['stretched_exp']:
        summary = (f"β={result['beta']:.1f}: "
                   f"Rank={result['rank']}, "
                   f"Actual={result['actual_percentage']:.2f}%, "
                   f"Error={result['error']:.2f}%")
        print(summary)

    # Display the figure after the text report has been printed.
    plt.show()