Context Navigation

source: trunk/UTIL/PYTHON/powerlaw/plfit.py @ 1011

Last change on this file since 1011 was 943, checked in by aslmd, 13 years ago
clean and organized UTIL/PYTHON folder
File size: 14.9 KB

Line
1	from math import *
2
3	# function [alpha, xmin, L]=plfit(x, varargin)
4	# PLFIT fits a power-law distributional model to data.
5	# Source: http://www.santafe.edu/~aaronc/powerlaws/
6	#
7	# PLFIT(x) estimates x_min and alpha according to the goodness-of-fit
8	# based method described in Clauset, Shalizi, Newman (2007). x is a
9	# vector of observations of some quantity to which we wish to fit the
10	# power-law distribution p(x) ~ x^-alpha for x >= xmin.
11	# PLFIT automatically detects whether x is composed of real or integer
12	# values, and applies the appropriate method. For discrete data, if
13	# min(x) > 1000, PLFIT uses the continuous approximation, which is
14	# a reliable in this regime.
15	#
16	# The fitting procedure works as follows:
17	# 1) For each possible choice of x_min, we estimate alpha via the
18	# method of maximum likelihood, and calculate the Kolmogorov-Smirnov
19	# goodness-of-fit statistic D.
20	# 2) We then select as our estimate of x_min, the value that gives the
21	# minimum value D over all values of x_min.
22	#
23	# Note that this procedure gives no estimate of the uncertainty of the
24	# fitted parameters, nor of the validity of the fit.
25	#
26	# Example:
27	# x = [500,150,90,81,75,75,70,65,60,58,49,47,40]
28	# [alpha, xmin, L] = plfit(x)
29	# or a = plfit(x)
30	#
31	# The output 'alpha' is the maximum likelihood estimate of the scaling
32	# exponent, 'xmin' is the estimate of the lower bound of the power-law
33	# behavior, and L is the log-likelihood of the data x>=xmin under the
34	# fitted power law.
35	#
36	# For more information, try 'type plfit'
37	#
38	# See also PLVAR, PLPVA
39
40	# Version 1.0.10 (2010 January)
41	# Copyright (C) 2008-2011 Aaron Clauset (Santa Fe Institute)
42
43	# Ported to Python by Joel Ornstein (2011 July)
44	# (joel_ornstein@hmc.edu)
45
46	# Distributed under GPL 2.0
47	# http://www.gnu.org/copyleft/gpl.html
48	# PLFIT comes with ABSOLUTELY NO WARRANTY
49	#
50	#
51	# The 'zeta' helper function is modified from the open-source library 'mpmath'
52	# mpmath: a Python library for arbitrary-precision floating-point arithmetic
53	# http://code.google.com/p/mpmath/
54	# version 0.17 (February 2011) by Fredrik Johansson and others
55	#
56
57	# Notes:
58	#
59	# 1. In order to implement the integer-based methods in Matlab, the numeric
60	# maximization of the log-likelihood function was used. This requires
61	# that we specify the range of scaling parameters considered. We set
62	# this range to be 1.50 to 3.50 at 0.01 intervals by default.
63	# This range can be set by the user like so,
64	#
65	# a = plfit(x,'range',[1.50,3.50,0.01])
66	#
67	# 2. PLFIT can be told to limit the range of values considered as estimates
68	# for xmin in three ways. First, it can be instructed to sample these
69	# possible values like so,
70	#
71	# a = plfit(x,'sample',100)
72	#
73	# which uses 100 uniformly distributed values on the sorted list of
74	# unique values in the data set. Second, it can simply omit all
75	# candidates above a hard limit, like so
76	#
77	# a = plfit(x,'limit',3.4)
78	#
79	# Finally, it can be forced to use a fixed value, like so
80	#
81	# a = plfit(x,'xmin',3.4)
82	#
83	# In the case of discrete data, it rounds the limit to the nearest
84	# integer.
85	#
86	# 3. When the input sample size is small (e.g., < 100), the continuous
87	# estimator is slightly biased (toward larger values of alpha). To
88	# explicitly use an experimental finite-size correction, call PLFIT like
89	# so
90	#
91	# a = plfit(x,'finite')
92	#
93	# which does a small-size correction to alpha.
94	#
95	# 4. For continuous data, PLFIT can return erroneously large estimates of
96	# alpha when xmin is so large that the number of obs x >= xmin is very
97	# small. To prevent this, we can truncate the search over xmin values
98	# before the finite-size bias becomes significant by calling PLFIT as
99	#
100	# a = plfit(x,'nosmall')
101	#
102	# which skips values xmin with finite size bias > 0.1.
103
104	def plfit(x, *varargin):
105	vec = []
106	sample = []
107	xminx = []
108	limit = []
109	finite = False
110	nosmall = False
111	nowarn = False
112
113	# parse command-line parameters trap for bad input
114	i=0
115	while i<len(varargin):
116	argok = 1
117	if type(varargin[i])==str:
118	if varargin[i]=='range':
119	Range = varargin[i+1]
120	if Range[1]>Range[0]:
121	argok=0
122	vec=[]
123	try:
124	vec=map(lambda X:X*float(Range[2])+Range[0],\
125	range(int((Range[1]-Range[0])/Range[2])))
126
127
128	except:
129	argok=0
130	vec=[]
131
132
133	if Range[0]>=Range[1]:
134	argok=0
135	vec=[]
136	i-=1
137
138	i+=1
139
140
141	elif varargin[i]== 'sample':
142	sample = varargin[i+1]
143	i = i + 1
144	elif varargin[i]== 'limit':
145	limit = varargin[i+1]
146	i = i + 1
147	elif varargin[i]== 'xmin':
148	xminx = varargin[i+1]
149	i = i + 1
150	elif varargin[i]== 'finite': finite = True
151	elif varargin[i]== 'nowarn': nowarn = True
152	elif varargin[i]== 'nosmall': nosmall = True
153	else: argok=0
154
155
156	if not argok:
157	print '(PLFIT) Ignoring invalid argument #',i+1
158
159	i = i+1
160
161	if vec!=[] and (type(vec)!=list or min(vec)<=1):
162	print '(PLFIT) Error: ''range'' argument must contain a vector or minimum <= 1. using default.\n'
163
164	vec = []
165
166	if sample!=[] and sample<2:
167	print'(PLFIT) Error: ''sample'' argument must be a positive integer > 1. using default.\n'
168	sample = []
169
170	if limit!=[] and limit<min(x):
171	print'(PLFIT) Error: ''limit'' argument must be a positive value >= 1. using default.\n'
172	limit = []
173
174	if xminx!=[] and xminx>=max(x):
175	print'(PLFIT) Error: ''xmin'' argument must be a positive value < max(x). using default behavior.\n'
176	xminx = []
177
178
179
180	# select method (discrete or continuous) for fitting
181	if reduce(lambda X,Y:X==True and floor(Y)==float(Y),x,True): f_dattype = 'INTS'
182	elif reduce(lambda X,Y:X==True and (type(Y)==int or type(Y)==float or type(Y)==long),x,True): f_dattype = 'REAL'
183	#else: f_dattype = 'UNKN'
184	##AS
185	else: f_dattype = 'REAL'
186
187	if f_dattype=='INTS' and min(x) > 1000 and len(x)>100:
188	f_dattype = 'REAL'
189
190	print f_dattype
191
192	# estimate xmin and alpha, accordingly
193
194	if f_dattype== 'REAL':
195	xmins = unique(x)
196	xmins.sort()
197	xmins = xmins[0:-1]
198	if xminx!=[]:
199
200	xmins = [min(filter(lambda X: X>=xminx,xmins))]
201
202
203	if limit!=[]:
204	xmins=filter(lambda X: X<=limit,xmins)
205	if xmins==[]: xmins = [min(x)]
206
207	if sample!=[]:
208	step = float(len(xmins))/(sample-1)
209	index_curr=0
210	new_xmins=[]
211	for i in range (0,sample):
212	if round(index_curr)==len(xmins): index_curr-=1
213	new_xmins.append(xmins[int(round(index_curr))])
214	index_curr+=step
215	xmins = unique(new_xmins)
216	xmins.sort()
217
218
219
220	dat = []
221	z = sorted(x)
222
223	for xm in range(0,len(xmins)):
224	xmin = xmins[xm]
225	z = filter(lambda X:X>=xmin,z)
226
227	n = len(z)
228	# estimate alpha using direct MLE
229
230	a = float(n) / sum(map(lambda X: log(float(X)/xmin),z))
231	if nosmall:
232	if (a-1)/sqrt(n) > 0.1 and dat!=[]:
233	xm = len(xmins)+1
234	break
235
236
237	# compute KS statistic
238	#cx = map(lambda X:float(X)/n,range(0,n))
239	cf = map(lambda X:1-pow((float(xmin)/X),a),z)
240	dat.append( max( map(lambda X: abs(cf[X]-float(X)/n),range(0,n))))
241	D = min(dat)
242	xmin = xmins[dat.index(D)]
243	z = filter(lambda X:X>=xmin,x)
244	z.sort()
245	n = len(z)
246	alpha = 1 + n / sum(map(lambda X: log(float(X)/xmin),z))
247	if finite: alpha = alpha*float(n-1)/n+1./n # finite-size correction
248	if n < 50 and not finite and not nowarn:
249	print '(PLFIT) Warning: finite-size bias may be present.\n'
250
251	L = nlog((alpha-1)/xmin) - alphasum(map(lambda X: log(float(X)/xmin),z))
252	elif f_dattype== 'INTS':
253
254	x=map(int,x)
255	if vec==[]:
256	for X in range(150,351):
257	vec.append(X/100.) # covers range of most practical
258	# scaling parameters
259	zvec = map(zeta, vec)
260
261	xmins = unique(x)
262	xmins.sort()
263	xmins = xmins[0:-1]
264	if xminx!=[]:
265	xmins = [min(filter(lambda X: X>=xminx,xmins))]
266
267	if limit!=[]:
268	limit = round(limit)
269	xmins=filter(lambda X: X<=limit,xmins)
270	if xmins==[]: xmins = [min(x)]
271
272	if sample!=[]:
273	step = float(len(xmins))/(sample-1)
274	index_curr=0
275	new_xmins=[]
276	for i in range (0,sample):
277	if round(index_curr)==len(xmins): index_curr-=1
278	new_xmins.append(xmins[int(round(index_curr))])
279	index_curr+=step
280	xmins = unique(new_xmins)
281	xmins.sort()
282
283	if xmins==[]:
284	print '(PLFIT) Error: x must contain at least two unique values.\n'
285	alpha = 'Not a Number'
286	xmin = x[0]
287	D = 'Not a Number'
288	return [alpha,xmin,D]
289
290	xmax = max(x)
291
292	z = x
293	z.sort()
294	datA=[]
295	datB=[]
296
297	for xm in range(0,len(xmins)):
298	xmin = xmins[xm]
299	z = filter(lambda X:X>=xmin,z)
300	n = len(z)
301	# estimate alpha via direct maximization of likelihood function
302
303	# force iterative calculation
304	L = []
305	slogz = sum(map(log,z))
306	xminvec = map(float,range(1,xmin))
307	for k in range(0,len(vec)):
308	L.append(-vec[k]float(slogz) - float(n)log(float(zvec[k]) - sum(map(lambda X:pow(float(X),-vec[k]),xminvec))))
309
310
311	I = L.index(max(L))
312	# compute KS statistic
313	fit = reduce(lambda X,Y: X+[Y+X[-1]],\
314	(map(lambda X: pow(X,-vec[I])/(float(zvec[I])-sum(map(lambda X: pow(X,-vec[I]),map(float,range(1,xmin))))),range(xmin,xmax+1))),[0])[1:]
315	cdi=[]
316	for XM in range(xmin,xmax+1):
317	cdi.append(len(filter(lambda X: floor(X)<=XM,z))/float(n))
318
319	datA.append(max( map(lambda X: abs(fit[X] - cdi[X]),range(0,xmax-xmin+1))))
320	datB.append(vec[I])
321	# select the index for the minimum value of D
322	I = datA.index(min(datA))
323	xmin = xmins[I]
324	z = filter(lambda X:X>=xmin,x)
325	n = len(z)
326	alpha = datB[I]
327	if finite: alpha = alpha*(n-1.)/n+1./n # finite-size correction
328	if n < 50 and not finite and not nowarn:
329	print '(PLFIT) Warning: finite-size bias may be present.\n'
330
331	L = -alphasum(map(log,z)) - nlog(zvec[vec.index(max(filter(lambda X:X<=alpha,vec)))] - \
332	sum(map(lambda X: pow(X,-alpha),range(1,xmin))))
333	else:
334	print '(PLFIT) Error: x must contain only reals or only integers.\n'
335	alpha = []
336	xmin = []
337	L = []
338
339	return [alpha,xmin,L]
340
341
342	# helper functions (unique and zeta)
343
344
345	def unique(seq):
346	# not order preserving
347	set = {}
348	map(set.__setitem__, seq, [])
349	return set.keys()
350
351	def _polyval(coeffs, x):
352	p = coeffs[0]
353	for c in coeffs[1:]:
354	p = c + x*p
355	return p
356
357	_zeta_int = [\
358	-0.5,
359	0.0,
360	1.6449340668482264365,1.2020569031595942854,1.0823232337111381915,
361	1.0369277551433699263,1.0173430619844491397,1.0083492773819228268,
362	1.0040773561979443394,1.0020083928260822144,1.0009945751278180853,
363	1.0004941886041194646,1.0002460865533080483,1.0001227133475784891,
364	1.0000612481350587048,1.0000305882363070205,1.0000152822594086519,
365	1.0000076371976378998,1.0000038172932649998,1.0000019082127165539,
366	1.0000009539620338728,1.0000004769329867878,1.0000002384505027277,
367	1.0000001192199259653,1.0000000596081890513,1.0000000298035035147,
368	1.0000000149015548284]
369
370	_zeta_P = [-3.50000000087575873, -0.701274355654678147,
371	-0.0672313458590012612, -0.00398731457954257841,
372	-0.000160948723019303141, -4.67633010038383371e-6,
373	-1.02078104417700585e-7, -1.68030037095896287e-9,
374	-1.85231868742346722e-11][::-1]
375
376	_zeta_Q = [1.00000000000000000, -0.936552848762465319,
377	-0.0588835413263763741, -0.00441498861482948666,
378	-0.000143416758067432622, -5.10691659585090782e-6,
379	-9.58813053268913799e-8, -1.72963791443181972e-9,
380	-1.83527919681474132e-11][::-1]
381
382	_zeta_1 = [3.03768838606128127e-10, -1.21924525236601262e-8,
383	2.01201845887608893e-7, -1.53917240683468381e-6,
384	-5.09890411005967954e-7, 0.000122464707271619326,
385	-0.000905721539353130232, -0.00239315326074843037,
386	0.084239750013159168, 0.418938517907442414, 0.500000001921884009]
387
388	_zeta_0 = [-3.46092485016748794e-10, -6.42610089468292485e-9,
389	1.76409071536679773e-7, -1.47141263991560698e-6, -6.38880222546167613e-7,
390	0.000122641099800668209, -0.000905894913516772796, -0.00239303348507992713,
391	0.0842396947501199816, 0.418938533204660256, 0.500000000000000052]
392
393	def zeta(s):
394	"""
395	Riemann zeta function, real argument
396	"""
397	if not isinstance(s, (float, int)):
398	try:
399	s = float(s)
400	except (ValueError, TypeError):
401	try:
402	s = complex(s)
403	if not s.imag:
404	return complex(zeta(s.real))
405	except (ValueError, TypeError):
406	pass
407	raise NotImplementedError
408	if s == 1:
409	raise ValueError("zeta(1) pole")
410	if s >= 27:
411	return 1.0 + 2.0(-s) + 3.0(-s)
412	n = int(s)
413	if n == s:
414	if n >= 0:
415	return _zeta_int[n]
416	if not (n % 2):
417	return 0.0
418	if s <= 0.0:
419	return 0
420	if s <= 2.0:
421	if s <= 1.0:
422	return _polyval(_zeta_0,s)/(s-1)
423	return _polyval(_zeta_1,s)/(s-1)
424	z = _polyval(_zeta_P,s) / _polyval(_zeta_Q,s)
425	return 1.0 + 2.0(-s) + 3.0(-s) + 4.0*(-s)z
426
427
428
429

Note: See TracBrowser for help on using the repository browser.

Download in other formats: