1 | from math import * |
---|
2 | from random import * |
---|
3 | |
---|
4 | # function [alpha, xmin, n]=plpva(x, xmin, varargin) |
---|
5 | # PLPVA calculates the p-value for the given power-law fit to some data. |
---|
6 | # Source: http://www.santafe.edu/~aaronc/powerlaws/ |
---|
7 | # |
---|
8 | # PLPVA(x, xmin) takes data x and given lower cutoff for the power-law |
---|
9 | # behavior xmin and computes the corresponding p-value for the |
---|
10 | # Kolmogorov-Smirnov test, according to the method described in |
---|
11 | # Clauset, Shalizi, Newman (2007). |
---|
12 | # PLPVA automatically detects whether x is composed of real or integer |
---|
13 | # values, and applies the appropriate method. For discrete data, if |
---|
14 | # min(x) > 1000, PLPVA uses the continuous approximation, which is |
---|
15 | # a reliable in this regime. |
---|
16 | # |
---|
17 | # The fitting procedure works as follows: |
---|
18 | # 1) For each possible choice of x_min, we estimate alpha via the |
---|
19 | # method of maximum likelihood, and calculate the Kolmogorov-Smirnov |
---|
20 | # goodness-of-fit statistic D. |
---|
21 | # 2) We then select as our estimate of x_min, the value that gives the |
---|
22 | # minimum value D over all values of x_min. |
---|
23 | # |
---|
24 | # Note that this procedure gives no estimate of the uncertainty of the |
---|
25 | # fitted parameters, nor of the validity of the fit. |
---|
26 | # |
---|
27 | # Example: |
---|
28 | # x = [500,150,90,81,75,75,70,65,60,58,49,47,40] |
---|
29 | # [p, gof] = plpva(x, 1); |
---|
30 | # |
---|
31 | # For more information, try 'type plpva' |
---|
32 | # |
---|
33 | # See also PLFIT, PLVAR |
---|
34 | |
---|
35 | # Version 1.0.7 (2009 October) |
---|
36 | # Copyright (C) 2008-2011 Aaron Clauset (Santa Fe Institute) |
---|
37 | |
---|
38 | # Ported to Python by Joel Ornstein (2011 July) |
---|
39 | #(joel_ornstein@hmc.edu) |
---|
40 | |
---|
41 | # Distributed under GPL 2.0 |
---|
42 | # http://www.gnu.org/copyleft/gpl.html |
---|
43 | # PLPVA comes with ABSOLUTELY NO WARRANTY |
---|
44 | # |
---|
45 | # |
---|
46 | # The 'zeta' helper function is modified from the open-source library 'mpmath' |
---|
47 | # mpmath: a Python library for arbitrary-precision floating-point arithmetic |
---|
48 | # http://code.google.com/p/mpmath/ |
---|
49 | # version 0.17 (February 2011) by Fredrik Johansson and others |
---|
50 | # |
---|
51 | |
---|
52 | # Notes: |
---|
53 | # |
---|
54 | # 1. In order to implement the integer-based methods in Matlab, the numeric |
---|
55 | # maximization of the log-likelihood function was used. This requires |
---|
56 | # that we specify the range of scaling parameters considered. We set |
---|
57 | # this range to be 1.50 to 3.50 at 0.01 intervals by default. |
---|
58 | # This range can be set by the user like so, |
---|
59 | # |
---|
60 | # x = [500,150,90,81,75,75,70,65,60,58,49,47,40] |
---|
61 | # a = plpva(x,1,'range',[1.50,3.50,0.01]) |
---|
62 | # |
---|
63 | # 2. PLPVA can be told to limit the range of values considered as estimates |
---|
64 | # for xmin in two ways. First, it can be instructed to sample these |
---|
65 | # possible values like so, |
---|
66 | # |
---|
67 | # a = plpva(x,1,'sample',100); |
---|
68 | # |
---|
69 | # which uses 100 uniformly distributed values on the sorted list of |
---|
70 | # unique values in the data set. Second, it can simply omit all |
---|
71 | # candidates above a hard limit, like so |
---|
72 | # |
---|
73 | # a = plpva(x,1,'limit',3.4); |
---|
74 | # |
---|
75 | # Finally, it can be forced to use a fixed value, like so |
---|
76 | # |
---|
77 | # a = plpva(x,1,'xmin',1); |
---|
78 | # |
---|
79 | # In the case of discrete data, it rounds the limit to the nearest |
---|
80 | # integer. |
---|
81 | # |
---|
82 | # 3. The default number of semiparametric repetitions of the fitting |
---|
83 | # procedure is 1000. This number can be changed like so |
---|
84 | # |
---|
85 | # p = plpva(x, 1,'reps',10000); |
---|
86 | # |
---|
87 | # 4. To silence the textual output to the screen, do this |
---|
88 | # |
---|
89 | # p = plpva(x, 1,'reps',10000,'silent'); |
---|
90 | # |
---|
91 | |
---|
92 | def plpva(x,xmin, *varargin): |
---|
93 | vec = [] |
---|
94 | sample = [] |
---|
95 | xminx = [] |
---|
96 | limit = [] |
---|
97 | Bt = [] |
---|
98 | quiet = False |
---|
99 | |
---|
100 | |
---|
101 | # parse command-line parameters trap for bad input |
---|
102 | i=0 |
---|
103 | while i<len(varargin): |
---|
104 | argok = 1 |
---|
105 | if type(varargin[i])==str: |
---|
106 | if varargin[i]=='range': |
---|
107 | Range = varargin[i+1] |
---|
108 | if Range[1]>Range[0]: |
---|
109 | argok=0 |
---|
110 | vec=[] |
---|
111 | try: |
---|
112 | vec=map(lambda X:X*float(Range[2])+Range[0],\ |
---|
113 | range(int((Range[1]-Range[0])/Range[2]))) |
---|
114 | |
---|
115 | |
---|
116 | except: |
---|
117 | argok=0 |
---|
118 | vec=[] |
---|
119 | |
---|
120 | |
---|
121 | if Range[0]>=Range[1]: |
---|
122 | argok=0 |
---|
123 | vec=[] |
---|
124 | i-=1 |
---|
125 | |
---|
126 | i+=1 |
---|
127 | |
---|
128 | |
---|
129 | elif varargin[i]== 'sample': |
---|
130 | sample = varargin[i+1] |
---|
131 | i = i + 1 |
---|
132 | elif varargin[i]== 'limit': |
---|
133 | limit = varargin[i+1] |
---|
134 | i = i + 1 |
---|
135 | elif varargin[i]== 'xmin': |
---|
136 | xminx = varargin[i+1] |
---|
137 | i = i + 1 |
---|
138 | elif varargin[i]== 'reps': |
---|
139 | Bt = varargin[i+1] |
---|
140 | i = i + 1 |
---|
141 | elif varargin[i]== 'silent': quiet = True |
---|
142 | |
---|
143 | else: argok=0 |
---|
144 | |
---|
145 | |
---|
146 | if not argok: |
---|
147 | print '(PLPVA) Ignoring invalid argument #',i+1 |
---|
148 | |
---|
149 | i = i+1 |
---|
150 | |
---|
151 | if vec!=[] and (type(vec)!=list or min(vec)<=1): |
---|
152 | print '(PLPVA) Error: ''range'' argument must contain a vector or minimum <= 1. using default.\n' |
---|
153 | |
---|
154 | vec = [] |
---|
155 | |
---|
156 | if sample!=[] and sample<2: |
---|
157 | print'(PLPVA) Error: ''sample'' argument must be a positive integer > 1. using default.\n' |
---|
158 | sample = [] |
---|
159 | |
---|
160 | if limit!=[] and limit<min(x): |
---|
161 | print'(PLPVA) Error: ''limit'' argument must be a positive value >= 1. using default.\n' |
---|
162 | limit = [] |
---|
163 | |
---|
164 | if xminx!=[] and xminx>=max(x): |
---|
165 | print'(PLPVA) Error: ''xmin'' argument must be a positive value < max(x). using default behavior.\n' |
---|
166 | xminx = [] |
---|
167 | if Bt!=[] and Bt<2: |
---|
168 | print '(PLPVA) Error: ''reps'' argument must be a positive value > 1; using default.\n' |
---|
169 | Bt = []; |
---|
170 | |
---|
171 | |
---|
172 | # select method (discrete or continuous) for fitting |
---|
173 | if reduce(lambda X,Y:X==True and floor(Y)==float(Y),x,True): f_dattype = 'INTS' |
---|
174 | elif reduce(lambda X,Y:X==True and (type(Y)==int or type(Y)==float or type(Y)==long),x,True): f_dattype = 'REAL' |
---|
175 | else: f_dattype = 'UNKN' |
---|
176 | |
---|
177 | if f_dattype=='INTS' and min(x) > 1000 and len(x)>100: |
---|
178 | f_dattype = 'REAL' |
---|
179 | |
---|
180 | N=len(x) |
---|
181 | |
---|
182 | if Bt==[]: Bt=1000 |
---|
183 | nof = [] |
---|
184 | if not quiet: |
---|
185 | print 'Power-law Distribution, p-value calculation\n' |
---|
186 | print ' Copyright 2007-2010 Aaron Clauset\n' |
---|
187 | print ' Warning: This can be a slow calculation; please be patient.\n' |
---|
188 | print ' n = ',len(x),'\n xmin = ',xmin,'\n reps = ',Bt |
---|
189 | # estimate xmin and alpha, accordingly |
---|
190 | if f_dattype== 'REAL': |
---|
191 | # compute D for the empirical distribution |
---|
192 | z = filter(lambda X:X>=xmin,x) |
---|
193 | z.sort() |
---|
194 | nz = len(z) |
---|
195 | y = filter(lambda X:X<xmin,x) |
---|
196 | ny = len(y) |
---|
197 | alpha = 1 + float(nz) / sum(map(lambda X: log(float(X)/xmin),z)) |
---|
198 | cz = map(lambda X:X/float(nz),range(0,nz)) |
---|
199 | cf = map(lambda X:1-pow((xmin/float(X)),(alpha-1)),z) |
---|
200 | gof = max( map(lambda X:abs(cz[X] - cf[X]),range(0,nz))) |
---|
201 | pz = nz/float(N) |
---|
202 | |
---|
203 | # compute distribution of gofs from semi-parametric bootstrap |
---|
204 | # of entire data set with fit |
---|
205 | for B in range(0,Bt): |
---|
206 | # semi-parametric bootstrap of data |
---|
207 | n1=0 |
---|
208 | for i in range(0,N): |
---|
209 | if random()>pz: n1+=1 |
---|
210 | |
---|
211 | q1=[] |
---|
212 | for i in range(0,n1): |
---|
213 | q1.append(y[int(floor(ny*random()))]) |
---|
214 | n2 = N-n1; |
---|
215 | q2=[] |
---|
216 | for i in range (0,n2): |
---|
217 | q2.append(random()) |
---|
218 | q2 = map(lambda X:xmin*pow(1.-X,(-1./(alpha-1.))),q2) |
---|
219 | |
---|
220 | q = q1+q2 |
---|
221 | q.sort() |
---|
222 | |
---|
223 | |
---|
224 | |
---|
225 | # estimate xmin and alpha via GoF-method |
---|
226 | qmins = unique(q); |
---|
227 | qmins.sort() |
---|
228 | qmins = qmins[0:-1]; |
---|
229 | |
---|
230 | if xminx!=[]: |
---|
231 | |
---|
232 | qmins = [min(filter(lambda X: X>=xminx,qmins))] |
---|
233 | |
---|
234 | |
---|
235 | if limit!=[]: |
---|
236 | qmins=filter(lambda X: X<=limit,qmins) |
---|
237 | if qmins==[]: qmins=[min(q)] |
---|
238 | |
---|
239 | if sample!=[]: |
---|
240 | step = float(len(qmins))/(sample-1) |
---|
241 | index_curr=0 |
---|
242 | new_qmins=[] |
---|
243 | for i in range (0,sample): |
---|
244 | if round(index_curr)==len(qmins): index_curr-=1 |
---|
245 | new_qmins.append(qmins[int(round(index_curr))]) |
---|
246 | index_curr+=step |
---|
247 | qmins = unique(new_qmins) |
---|
248 | qmins.sort() |
---|
249 | |
---|
250 | |
---|
251 | |
---|
252 | dat = [] |
---|
253 | |
---|
254 | for qm in range(0,len(qmins)): |
---|
255 | qmin = qmins[qm] |
---|
256 | zq = filter(lambda X:X>=qmin,q) |
---|
257 | nq = len(zq) |
---|
258 | |
---|
259 | a = float(nq)/sum(map(lambda X: log(float(X)/qmin),zq)) |
---|
260 | cf = map(lambda X:1-pow((float(qmin)/X),a),zq) |
---|
261 | |
---|
262 | dat.append( max( map(lambda X: abs(cf[X]-float(X)/nq),range(0,nq)))) |
---|
263 | if not quiet: |
---|
264 | print '['+str(B+1)+']\tp = ',round(len(filter(lambda X:X>=gof,nof))/float(B+1),3) |
---|
265 | nof.append(min(dat)) |
---|
266 | p=len(filter(lambda X:X>=gof,nof))/float(Bt+1) |
---|
267 | |
---|
268 | elif f_dattype== 'INTS': |
---|
269 | x=map(int,x) |
---|
270 | if vec==[]: |
---|
271 | for X in range(150,351): |
---|
272 | vec.append(X/100.) # covers range of most practical |
---|
273 | # scaling parameters |
---|
274 | zvec = map(zeta, vec) |
---|
275 | z = filter (lambda X:X>=xmin,x) |
---|
276 | nz = len(z) |
---|
277 | y = filter (lambda X:X<xmin,x) |
---|
278 | ny = len(y) |
---|
279 | xmax=max(z) |
---|
280 | L = [] |
---|
281 | for k in range(0,len(vec)): |
---|
282 | L.append(-vec[k]*float(sum(map(log,z))) - float(nz)*log(float(zvec[k]) - sum(map(lambda X:pow(float(X),-vec[k]),range(1,xmin))))) |
---|
283 | I = L.index(max(L)) |
---|
284 | alpha=vec[I] |
---|
285 | fit = reduce(lambda X,Y: X+[Y+X[-1]],\ |
---|
286 | (map(lambda X: pow(X,-alpha)/(float(zvec[I])-sum(map(lambda X: pow(X,-alpha),map(float,range(1,xmin))))),range(xmin,xmax+1))),[0])[1:] |
---|
287 | cdi=[] |
---|
288 | for XM in range(xmin,xmax+1): |
---|
289 | cdi.append(len(filter(lambda X: floor(X)<=XM,z))/float(nz)) |
---|
290 | |
---|
291 | gof= max( map(lambda X: abs(fit[X] - cdi[X]),range(0,xmax-xmin+1))) |
---|
292 | pz=nz/float(N) |
---|
293 | mmax = 20*xmax |
---|
294 | pdf=[] |
---|
295 | for i in range(0,xmin): |
---|
296 | pdf.append(0) |
---|
297 | pdf += map(lambda X: pow(X,-alpha)/(float(zvec[I])-sum(map(lambda X: pow(X,-alpha),map(float,range(1,xmin))))),range(xmin,mmax+1)) |
---|
298 | cdf = reduce(lambda X,Y: X+[Y+X[-1]],pdf,[0])[1:]+[1] |
---|
299 | |
---|
300 | # compute distribution of gofs from semi-parametric bootstrap |
---|
301 | # of entire data set with fit |
---|
302 | for B in range(0,Bt): |
---|
303 | # semi-parametric bootstrap of data |
---|
304 | n1=0 |
---|
305 | for i in range(0,N): |
---|
306 | if random()>pz: n1+=1 |
---|
307 | |
---|
308 | q1=[] |
---|
309 | for i in range(0,n1): |
---|
310 | q1.append( y[int(floor(ny*random()))]) |
---|
311 | n2 = N-n1; |
---|
312 | r2=[] |
---|
313 | for i in range (0,n2): |
---|
314 | r2.append(random()) |
---|
315 | r2.sort() |
---|
316 | c=0 |
---|
317 | k=0 |
---|
318 | q2=[] |
---|
319 | for i in range(xmin,mmax+2): |
---|
320 | while c<n2 and r2[c]<=cdf[i]:c+=1 |
---|
321 | for k in range(k,c):q2.append(i) |
---|
322 | k=c |
---|
323 | if k>=n2: break |
---|
324 | |
---|
325 | q = q1+q2 |
---|
326 | qmins = unique(q) |
---|
327 | qmins.sort() |
---|
328 | qmins = qmins[0:-1] |
---|
329 | |
---|
330 | if xminx!=[]: |
---|
331 | |
---|
332 | qmins = [min(filter(lambda X: X>=xminx,qmins))] |
---|
333 | |
---|
334 | |
---|
335 | if limit!=[]: |
---|
336 | qmins=filter(lambda X: X<=limit,qmins) |
---|
337 | if qmins==[]: qmins=[min(q)] |
---|
338 | |
---|
339 | if sample!=[]: |
---|
340 | step = float(len(qmins))/(sample-1) |
---|
341 | index_curr=0 |
---|
342 | new_qmins=[] |
---|
343 | for i in range (0,sample): |
---|
344 | if round(index_curr)==len(qmins): index_curr-=1 |
---|
345 | new_qmins.append(qmins[int(round(index_curr))]) |
---|
346 | index_curr+=step |
---|
347 | qmins = unique(new_qmins) |
---|
348 | qmins.sort() |
---|
349 | |
---|
350 | qmax = max(q) |
---|
351 | dat=[] |
---|
352 | zq=q |
---|
353 | |
---|
354 | |
---|
355 | for qm in range(0,len(qmins)): |
---|
356 | qmin = qmins[qm] |
---|
357 | zq = filter(lambda X:X>=qmin,zq) |
---|
358 | nq = len(zq) |
---|
359 | if nq>1: |
---|
360 | L=[] |
---|
361 | slogzq = sum(map(log,zq)) |
---|
362 | qminvec = range (1,qmin) |
---|
363 | for k in range (1,len(vec)): |
---|
364 | L.append(-vec[k]*float(slogzq) - float(nq)*log(float(zvec[k]) - sum(map(lambda X:pow(float(X),-vec[k]),qminvec)))) |
---|
365 | |
---|
366 | I = L.index(max(L)) |
---|
367 | fit = reduce(lambda X,Y: X+[Y+X[-1]],\ |
---|
368 | (map(lambda X: pow(X,-vec[I])/(float(zvec[I])-sum(map(lambda X: pow(X,-vec[I]),map(float,range(1,qmin))))),range(qmin,qmax+1))),[0])[1:] |
---|
369 | cdi=[] |
---|
370 | for QM in range(qmin,qmax+1): |
---|
371 | cdi.append(len(filter(lambda X: floor(X)<=QM,zq))/float(nq)) |
---|
372 | |
---|
373 | dat.append(max( map(lambda X: abs(fit[X] - cdi[X]),range(0,qmax-qmin+1)))) |
---|
374 | else: dat[qm]=float('-inf') |
---|
375 | if not quiet: |
---|
376 | print '['+str(B+1)+']\tp = ',round(len(filter(lambda X:X>=gof,nof))/float(B+1),3) |
---|
377 | nof.append(min(dat)) |
---|
378 | |
---|
379 | p=len(filter(lambda X:X>=gof,nof))/float(Bt) |
---|
380 | else: |
---|
381 | print '(PLPVA) Error: x must contain only reals or only integers.\n' |
---|
382 | p = [] |
---|
383 | gof = [] |
---|
384 | |
---|
385 | |
---|
386 | return [p,gof] |
---|
387 | |
---|
388 | |
---|
389 | # helper functions (unique and zeta) |
---|
390 | |
---|
391 | |
---|
392 | def unique(seq): |
---|
393 | # not order preserving |
---|
394 | set = {} |
---|
395 | map(set.__setitem__, seq, []) |
---|
396 | return set.keys() |
---|
397 | |
---|
398 | def _polyval(coeffs, x): |
---|
399 | p = coeffs[0] |
---|
400 | for c in coeffs[1:]: |
---|
401 | p = c + x*p |
---|
402 | return p |
---|
403 | |
---|
404 | _zeta_int = [\ |
---|
405 | -0.5, |
---|
406 | 0.0, |
---|
407 | 1.6449340668482264365,1.2020569031595942854,1.0823232337111381915, |
---|
408 | 1.0369277551433699263,1.0173430619844491397,1.0083492773819228268, |
---|
409 | 1.0040773561979443394,1.0020083928260822144,1.0009945751278180853, |
---|
410 | 1.0004941886041194646,1.0002460865533080483,1.0001227133475784891, |
---|
411 | 1.0000612481350587048,1.0000305882363070205,1.0000152822594086519, |
---|
412 | 1.0000076371976378998,1.0000038172932649998,1.0000019082127165539, |
---|
413 | 1.0000009539620338728,1.0000004769329867878,1.0000002384505027277, |
---|
414 | 1.0000001192199259653,1.0000000596081890513,1.0000000298035035147, |
---|
415 | 1.0000000149015548284] |
---|
416 | |
---|
417 | _zeta_P = [-3.50000000087575873, -0.701274355654678147, |
---|
418 | -0.0672313458590012612, -0.00398731457954257841, |
---|
419 | -0.000160948723019303141, -4.67633010038383371e-6, |
---|
420 | -1.02078104417700585e-7, -1.68030037095896287e-9, |
---|
421 | -1.85231868742346722e-11][::-1] |
---|
422 | |
---|
423 | _zeta_Q = [1.00000000000000000, -0.936552848762465319, |
---|
424 | -0.0588835413263763741, -0.00441498861482948666, |
---|
425 | -0.000143416758067432622, -5.10691659585090782e-6, |
---|
426 | -9.58813053268913799e-8, -1.72963791443181972e-9, |
---|
427 | -1.83527919681474132e-11][::-1] |
---|
428 | |
---|
429 | _zeta_1 = [3.03768838606128127e-10, -1.21924525236601262e-8, |
---|
430 | 2.01201845887608893e-7, -1.53917240683468381e-6, |
---|
431 | -5.09890411005967954e-7, 0.000122464707271619326, |
---|
432 | -0.000905721539353130232, -0.00239315326074843037, |
---|
433 | 0.084239750013159168, 0.418938517907442414, 0.500000001921884009] |
---|
434 | |
---|
435 | _zeta_0 = [-3.46092485016748794e-10, -6.42610089468292485e-9, |
---|
436 | 1.76409071536679773e-7, -1.47141263991560698e-6, -6.38880222546167613e-7, |
---|
437 | 0.000122641099800668209, -0.000905894913516772796, -0.00239303348507992713, |
---|
438 | 0.0842396947501199816, 0.418938533204660256, 0.500000000000000052] |
---|
439 | |
---|
440 | def zeta(s): |
---|
441 | """ |
---|
442 | Riemann zeta function, real argument |
---|
443 | """ |
---|
444 | if not isinstance(s, (float, int)): |
---|
445 | try: |
---|
446 | s = float(s) |
---|
447 | except (ValueError, TypeError): |
---|
448 | try: |
---|
449 | s = complex(s) |
---|
450 | if not s.imag: |
---|
451 | return complex(zeta(s.real)) |
---|
452 | except (ValueError, TypeError): |
---|
453 | pass |
---|
454 | raise NotImplementedError |
---|
455 | if s == 1: |
---|
456 | raise ValueError("zeta(1) pole") |
---|
457 | if s >= 27: |
---|
458 | return 1.0 + 2.0**(-s) + 3.0**(-s) |
---|
459 | n = int(s) |
---|
460 | if n == s: |
---|
461 | if n >= 0: |
---|
462 | return _zeta_int[n] |
---|
463 | if not (n % 2): |
---|
464 | return 0.0 |
---|
465 | if s <= 0.0: |
---|
466 | return 0 |
---|
467 | if s <= 2.0: |
---|
468 | if s <= 1.0: |
---|
469 | return _polyval(_zeta_0,s)/(s-1) |
---|
470 | return _polyval(_zeta_1,s)/(s-1) |
---|
471 | z = _polyval(_zeta_P,s) / _polyval(_zeta_Q,s) |
---|
472 | return 1.0 + 2.0**(-s) + 3.0**(-s) + 4.0**(-s)*z |
---|
473 | |
---|
474 | |
---|
475 | |
---|
476 | |
---|