This work has been published by Mehdi Rezvandehy and Clayton V. Deutsch at Journal of Stochastic Environmental Research and Risk Assessment https://doi.org/10.1007/s00477-017-1388-x. Here is a Python implementation with realistic data for declustering spatial correlation of data.
Summary¶
The spatial correlation called variogram is a key parameter for geostatistical estimation and simulation. Preferential sampling may bias the spatial structure and often leads to noisy and unreliable variograms. A novel technique is proposed to weight variogram pairs in order to compensate for preferential or clustered sampling. Weighting the variogram pairs by global kriging of the quadratic differences between the tail and head values gives each pair the appropriate weight, removes noise and minimizes artifacts in the experimental variogram. Moreover, variogram uncertainty could be computed by this technique. The required covariance between the pairs going into variogram calculation, is a fourth order covariance that must be calculated by second order moments.
A realistic example is presented for variaogram declustering and kriging estimation. The data for this example is the data "cluster.dat" taken from the literature literature
Implement Declustering in Python¶
import os
os.environ['path']+=";.\\Codes" # All required codes are in this folder
import pandas as pd
import numpy as np
import matplotlib
import pylab as plt
import geostatspy.GSLIB as GSLIB
import geostatspy.geostats as geostats
from IPython.display import clear_output
from matplotlib import gridspec
from matplotlib.offsetbox import AnchoredText
import warnings
warnings.filterwarnings('ignore')
You should first run Appendix that includes all required python functions
The Cluster Data Set¶
# Looking at the data
Cluster=GSLIB_View('Cluster.dat')
display(Cluster[0:10])
Xlocation | Ylocation | Primary | Secondary | Declustering | NS:Primary | |
---|---|---|---|---|---|---|
0 | 39.5 | 18.5 | 0.06 | 0.22 | 1.619 | -2.52518 |
1 | 5.5 | 1.5 | 0.06 | 0.27 | 1.619 | -2.11193 |
2 | 38.5 | 5.5 | 0.08 | 0.40 | 1.416 | -1.90816 |
3 | 20.5 | 1.5 | 0.09 | 0.39 | 1.821 | -1.65088 |
4 | 27.5 | 14.5 | 0.09 | 0.24 | 1.349 | -1.77366 |
5 | 40.5 | 21.5 | 0.10 | 0.48 | 0.944 | -1.56105 |
6 | 15.5 | 3.5 | 0.10 | 0.21 | 1.214 | -1.49880 |
7 | 6.5 | 25.5 | 0.11 | 0.36 | 1.619 | -1.42499 |
8 | 38.5 | 21.5 | 0.11 | 0.22 | 1.146 | -1.35975 |
9 | 23.5 | 18.5 | 0.16 | 0.30 | 1.821 | -1.29563 |
Cluster.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 140 entries, 0 to 139 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Xlocation 140 non-null float64 1 Ylocation 140 non-null float64 2 Primary 140 non-null float64 3 Secondary 140 non-null float64 4 Declustering 140 non-null float64 5 NS:Primary 140 non-null float64 dtypes: float64(6) memory usage: 6.7 KB
Cluster.describe()
Xlocation | Ylocation | Primary | Secondary | Declustering | NS:Primary | |
---|---|---|---|---|---|---|
count | 140.000000 | 140.000000 | 140.000000 | 140.000000 | 140.000000 | 140.000000 |
mean | 23.321429 | 25.607143 | 4.350429 | 4.140214 | 1.000021 | 0.393344 |
std | 14.461692 | 13.594697 | 6.726625 | 4.508008 | 0.542023 | 1.091754 |
min | 0.500000 | 0.500000 | 0.060000 | 0.180000 | 0.252000 | -2.525180 |
25% | 9.500000 | 14.250000 | 0.700000 | 0.787500 | 0.445000 | -0.386750 |
50% | 25.500000 | 27.000000 | 2.195000 | 2.375000 | 1.012000 | 0.497280 |
75% | 35.500000 | 36.500000 | 5.327500 | 5.580000 | 1.416000 | 1.261935 |
max | 48.500000 | 48.500000 | 58.320000 | 22.460000 | 2.023000 | 2.946470 |
Location Map¶
Figure below shows location map of cluster data with 140 samples. We assume the unit of measurements for coordinate system is kilometer. There are sevearl prefrential sampling that will affect the experimental variogram.
font = {'size' : 10}
matplotlib.rc('font', **font)
fig, axs = plt.subplots(figsize=(10, 5), dpi= 200, facecolor='w', edgecolor='k')
gs = gridspec.GridSpec(1, 2, width_ratios=[1.8, 1], wspace=0.19)
ax1=plt.subplot(gs[0])
plt.scatter(Cluster['Xlocation'], Cluster['Ylocation'],c=Cluster['Primary'],s=65,linewidths=1,edgecolors='k',
cmap='jet',vmin=0, vmax=14)
plt.gca().set_aspect('0.95')
plt.colorbar(shrink=0.8,label='Primary')
plt.xlabel('Easting (km)',fontsize='12')
plt.ylabel('Northing (km)',fontsize='12')
plt.title('Location Map of Primary',fontsize=12)
ax1=plt.subplot(gs[1])
plt.xlabel('Primary',fontsize='10')
plt.ylabel('Frequency',fontsize='10')
plt.hist(Cluster['Primary'], bins=15,ec='black')
# Calculate mean and variance of the variable
n=len(Cluster['Primary'])
Mean=np.mean(Cluster['Primary'])
SD=np.sqrt(np.var(Cluster['Primary']))
Max=np.amax(Cluster['Primary'])
Min=np.amin(Cluster['Primary'])
#plt.xlim(0,20)
plt.title('Histogram of Primary',fontsize=12)
plt.gca().set_aspect('0.5')
txt='n=%.0f \n $\mu$=%.3f \n $\sigma$=%.3f \n Max=%.1f \n Min=%.1f'
anchored_text = AnchoredText(txt %(n,Mean,SD,Max,Min), loc=1)
ax1.add_artist(anchored_text)
plt.show()
Variogram (Spatial Correlation) Calculation¶
#Experimental Variogram with Nromal Score and then fit a theoretical model
#Azimith 0: Experimental Variogram
tmin=-100;tmax=10000;xlag=2;xltol=1;nlag=14;azm=0;atol=20;bandwh=10;isill=1
Lag1, Var1, npp_az0=geostats.gamv(Cluster,'Xlocation', 'Ylocation','NS:Primary',tmin,
tmax,xlag,xltol,nlag,azm,atol,bandwh,isill)
#Azimith 0: Fitted variogram Model Parameters
nlag=100; xlag=0.3; azm=0
vario={'nst':2,'nug':0.01, 'cc1':0.44,'it1':1,'azi1':0,'hmaj1':10,'hmin1':10,
'cc2':0.55,'it2':1,'azi2':0,'hmaj2':20,'hmin2':20}
index_az0,Lag1_fit,Var1_fit,cov_az0,ro_az0=geostats.vmodel(nlag, xlag, azm, vario)
clear_output()
# Plot the calculated variogram
font = {'size' : 7}
matplotlib.rc('font', **font)
fig, axs = plt.subplots(figsize=(8,8), dpi= 200, facecolor='w', edgecolor='k')
fig.subplots_adjust(hspace = .05, wspace=0.25)
#
#Label and Title of Variogram Plot
label1='Azimuth 0 ' ;symbol1='--or';label1_fit='Fitted Model ';symbol1_fit='-r'
label2='Azimuth 90 ';symbol2='--sb';label2_fit='Fitted Model ';symbol2_fit='-b'
xlabl='Distance (km)'; xlim=[0,20]; vlim=[0,1.38]; sill=1;loc=4
ax1=plt.subplot(1,2,1)
#Plot Variograms for Azimith 0
Lag1=np.delete(Lag1, np.where(Var1==0))
Var1=np.delete(Var1, np.where(Var1==0))
plt.plot(Lag1, Var1,symbol1,linewidth=1,markersize=4,label=label1)
plt.plot(Lag1_fit, Var1_fit,symbol1_fit,linewidth=1.2,label=label1_fit)
plt.legend(fontsize=8,loc=loc,frameon=False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['top'].set_visible(False)
plt.gca().set_aspect('10')
plt.axhline(y=sill,linewidth=1,color='k',linestyle='--')
title='Variogram and Fitted Model_Azimuth 0'
plt.title(title,fontsize=10)
plt.xlabel(xlabl,fontsize=9)
ylabl='$\gamma (\mathbf{h}) $'
plt.ylabel(ylabl,rotation=0,fontsize=12,labelpad=15)
plt.xlim(float(xlim[0]),float(xlim[1]))
plt.ylim(float(vlim[0]),float(vlim[1]))
plt.show()
Decluster Variogram to Remove Prefrential Sampling Bias¶
# Call Vardec Fortran code in Python for variogram declustering
txt=" Parameters for Vardec \n"\
+" ***************************\n"\
+"START OF PARAMETERS: \n"\
+"Cluster.dat - file with data \n"\
+"1 2 0 - columns for X, Y, Z coordinates \n"\
+"6 - col number for variable \n"\
+"-1.0e21 1.0e21 - trimming limits \n"\
+"50 0.5 1 - nx,xmn,xsiz \n"\
+"50 0.5 1 - ny,ymn,ysiz \n"\
+"1 0.5 1 - nz,zmn,zsiz \n"\
+"2 - grid increment for global kriging, 1 considering all blocks \n"\
+"14 - number of lags \n"\
+"2 - lag separation distance \n"\
+"1 - lag tolerance \n"\
+"1 - number of directions \n"\
+"0 20 10.0 0.0 0.5 5.0 - azm,atol,bandh,dip,dtol,bandv \n"\
+"1 - standardize the experimental and declustered variogram \n"\
+"./Output/Vardec.out - output file for declustered variogram \n"\
+"./Output/vargplt.ps - output file for variogam ploting \n"\
+"./Output/varfit.var - variogram model for declustered variogram \n"\
+"2 0.01 - nst, nugget effect \n"\
+"1 0.44 0.0 0.0 0.0 - it,cc,ang1,ang2,ang3 \n"\
+" 10.0 10.0 10 - a_hmax,a_hmin,a_vert \n"\
+"1 0.55 0.0 0.0 0.0 - it,cc,ang1,ang2,ang3 \n"\
+" 20.0 20.0 10 - a_hmax,a_hmin,a_vert \n"
f1 = open('temp', "w");
f1.write(txt)
f1.close()
! echo temp | Vardec
! rm temp
clear_output()
# Reading output file of Vardec
Vardec=GSLIB_View('./Output/Vardec.out')
display(Vardec)
direction | lag-num | lag-dist | nparis | Exp-Variogram | GK-Variogram | Fit-Variogram | GK-Variance | fom_Variance | dof | chi-Variance | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | 1.0 | 1.0000 | 118.0 | 0.1968 | 0.2232 | 0.1296 | 0.0012 | 0.0012 | 23.0 | 0.0012 |
1 | 1.0 | 2.0 | 1.5814 | 118.0 | 0.2853 | 0.2507 | 0.1983 | 0.0012 | 0.0027 | 22.0 | 0.0029 |
2 | 1.0 | 3.0 | 3.9897 | 118.0 | 0.3845 | 0.5351 | 0.4666 | 0.0092 | 0.0273 | 24.0 | 0.0148 |
3 | 1.0 | 4.0 | 5.7452 | 118.0 | 0.4048 | 0.5836 | 0.6336 | 0.0169 | 0.0619 | 33.0 | 0.0202 |
4 | 1.0 | 5.0 | 7.7636 | 118.0 | 0.6147 | 0.7275 | 0.7795 | 0.0186 | 0.0574 | 81.0 | 0.0129 |
5 | 1.0 | 6.0 | 9.7287 | 118.0 | 1.0001 | 0.8096 | 0.8586 | 0.0198 | 0.0881 | 94.0 | 0.0143 |
6 | 1.0 | 7.0 | 11.9079 | 118.0 | 1.3553 | 0.9952 | 0.8877 | 0.0188 | 0.0908 | 119.0 | 0.0131 |
7 | 1.0 | 8.0 | 13.9369 | 118.0 | 1.2165 | 0.9597 | 0.9111 | 0.0202 | 0.1220 | 110.0 | 0.0158 |
8 | 1.0 | 9.0 | 15.8035 | 118.0 | 1.2602 | 0.8904 | 0.9307 | 0.0178 | 0.1053 | 108.0 | 0.0173 |
9 | 1.0 | 10.0 | 17.9516 | 118.0 | 1.2524 | 0.9298 | 0.9509 | 0.0207 | 0.1164 | 117.0 | 0.0168 |
10 | 1.0 | 11.0 | 19.7612 | 118.0 | 0.8971 | 0.9494 | 0.9656 | 0.0226 | 0.1286 | 90.0 | 0.0222 |
11 | 1.0 | 12.0 | 21.8553 | 118.0 | 0.8907 | 0.7930 | 0.9797 | 0.0163 | 0.1387 | 129.0 | 0.0155 |
12 | 1.0 | 13.0 | 23.8492 | 118.0 | 1.4209 | 1.0928 | 0.9899 | 0.0150 | 0.1161 | 135.0 | 0.0148 |
13 | 1.0 | 14.0 | 25.8530 | 118.0 | 1.4094 | 1.0309 | 0.9968 | 0.0211 | 0.1550 | 108.0 | 0.0185 |
14 | 1.0 | 15.0 | 27.7955 | 118.0 | 1.8288 | 1.5716 | 0.9998 | 0.0204 | 0.1368 | 118.0 | 0.0169 |
# Plot declustered variogram
font = {'size' : 7}
matplotlib.rc('font', **font)
fig, axs = plt.subplots(figsize=(8,8), dpi= 200, facecolor='w', edgecolor='k')
fig.subplots_adjust(hspace = .05, wspace=0.25)
ax1=plt.subplot(1,2,1)
xlim=[0,20]; vlim=[0,1.38]; sill=1; fsize=5; loc=4; outfile='vargplt_Declustered.png'
plt.axhline(y=sill,linewidth=0.6,color='k',linestyle='--')
plt.title(title,fontsize=fsize*1.5)
plt.xlabel(xlabl,fontsize=fsize*1.5)
ylabl='$\gamma (\mathbf{h}) $'
plt.ylabel(ylabl,rotation=0,fontsize=fsize*1.9,labelpad=15)
plt.xlim(float(xlim[0]),float(xlim[1]))
plt.ylim(float(vlim[0]),float(vlim[1]))
#
Lag1=Vardec['lag-dist']
Gamma1=Vardec['Exp-Variogram']
#
Lag2=Vardec['lag-dist']
Gamma2=Vardec['GK-Variogram']
#
Lagf3=Vardec['lag-dist']
Gammaf3=Vardec['Fit-Variogram']
symbol1= '-or'; symbol2= '-sb'; symbol3= '--b'; label1='Experimental Variogram'
label2='Declustered Variogram ';label3='Fitted Variogram (Declustered)'; xlabl='Distance (km)'
plt.gca().set_aspect('10')
plt.plot(Lag1, Gamma1,symbol1,linewidth=0.4,markersize=3,label=label1)
plt.plot(Lag2, Gamma2,symbol2,linewidth=1.5,markersize=4,label=label2)
plt.plot(Lagf3, Gammaf3,symbol3,linewidth=1.1,markersize=3,label=label3)
fsizet=fsize*1.3
plt.legend(fontsize=str(fsizet),loc=loc,frameon=False)
ax1.spines['left'].set_visible(True)
ax1.spines['bottom'].set_visible(True)
ax1.spines['right'].set_visible(False)
ax1.spines['top'].set_visible(False)
The declustered variogram (blue line) improves the experimental variogram (red line); it removes bias due to prefrential sampling. The declustered variogram (blue line) is fit again (blue dashed line) and used for geostatistical modeling.
Kriging Estimation Using Declustered Variogram Model¶
# Call kt3d GSLIB code in Python for kriging estimation
txt=" Parameters for KT3D \n"\
+" ******************* \n"\
+"START OF PARAMETERS: \n"\
+"Cluster.dat -file with data \n"\
+"0 1 2 0 3 0 - columns for DH,X,Y,Z,var,sec var \n"\
+"-1.0e21 1.0e21 - trimming limits \n"\
+"0 -option: 0=grid, 1=cross, 2=jackknife \n"\
+"xvk.dat -file with jackknife data \n"\
+"1 2 0 3 0 - columns for X,Y,Z,vr and sec var \n"\
+"3 -debugging level: 0,1,2,3 \n"\
+"./Output/kt3d.dbg -file for debugging output \n"\
+"./Output/kt3d.out -file for kriged output \n"\
+"50 0.5 1 - nx,xmn,xsiz \n"\
+"50 0.5 1 - ny,ymn,ysiz \n"\
+"1 0.5 1 - nz,zmn,zsiz \n"\
+"1 1 1 -x,y and z block discretization \n"\
+"1 8 -min, max data for kriging \n"\
+"0 -max per octant (0-> not used) \n"\
+"20.0 20.0 1.0 -maximum search radii \n"\
+"0.0 0.0 0.0 -angles for search ellipsoid \n"\
+"1 0.613545 -0=SK,1=OK,2=non-st SK,3=exdrift \n"\
+"0 0 0 0 0 0 0 0 0 -drift: x,y,z,xx,yy,zz,xy,xz,zy \n"\
+"0 -0, variable; 1, estimate trend \n"\
+"./Output/extdrift.dat -gridded file with drift/mean \n"\
+"4 - column number in gridded file \n"\
+"2 0.010 - nst, nugget effect (Declustered Variogram Model) \n"\
+"1 0.715 0.0 0.0 0.0 - it,cc,ang1,ang2,ang3 (Declustered Variogram Model) \n"\
+" 10.170 10.170 10.170 - a_hmax,a_hmin,a_vert (Declustered Variogram Model) \n"\
+"1 0.275 0.0 0.0 0.0 - it,cc,ang1,ang2,ang3 (Declustered Variogram Model) \n"\
+" 28.412 28.412 28.412 - a_hmax,a_hmin,a_vert (Declustered Variogram Model) \n"
f1 = open('temp', "w");
f1.write(txt)
f1.close()
! echo temp | KT3D
! rm temp
clear_output()
# Plor kriging estimation
font = {'size' : 8}
matplotlib.rc('font', **font)
fig, axs = plt.subplots(figsize=(10, 3.5), dpi= 200, facecolor='w', edgecolor='k')
# Reading kriging file
txt= './Output/kt3d.out'
ax1=plt.subplot(1,2,1)
plt.scatter(Cluster['Xlocation'], Cluster['Ylocation'],c=Cluster['Primary'],s=25,linewidths=1,edgecolors='k',
cmap='jet',vmin=0, vmax=20)
nx=50; xsiz=1; xmin=0.5; ny=50; ysiz=1; ymin=0.5
xa=np.zeros(nx*ny)
ya=np.zeros(nx*ny)
Var1a=np.zeros((ny,nx))
nxy=nx*ny
iz=0
with open(txt) as f:
vt1=[]
for i in range(4):
next(f)
for line in f:
p = line.split()
vt1.append(float(p[0]))
varkk=[]
kk=0
for j in range(ny):
for i in range(nx):
xa[kk]=xmin+(i)*xsiz
ya[kk]=ymin+(j)*ysiz
index1=i+1 + (j-1+1)*nx
Var1a[j,i]=vt1[index1-1]
kk=kk+1
plt.title('Kriging Estimation with Declustered Variogram',fontsize=8)
im = plt.imshow(Var1a, extent=(xa.min(), xa.max(), ya.min(), ya.max()), cmap='jet',
origin='lower',vmin=0,vmax=10,aspect=0.9)
plt.colorbar(shrink=0.8,label='Primary')
plt.xlabel('Easting (km)',fontsize='9')
plt.ylabel('Northing (km)',fontsize='9')
plt.show()
Appendix¶
# Python function to plot variogram
def vargplt(varex1,label1,symbol1,varfit1,label2,symbol2,varex2,label3,symbol3,varfit2,label4,symbol4,
title,xlabl,xlim,vlim,sill,fsize,location,outfile,**kwargs):
''' This is to plot experimental variogram and fitted model:
varex1 = file for first experimental variogram
label1 = label of first variogram
symbol1 = symbol of first variogram
varfit1 = fitted variogram of first variogram
varex2 = file for second experimental variogram
label2 = label of second variogram
symbol2 = symbol of second variogram
varfit2 = fitted variogram of second variogram
title = title of plot
xlabl = distance
xlim = limitation of x axis
vlim = limitation of variogram
sill = sill of the variogram
fsize = general font size of plot
loc = location of legend on the plot
outfile = output file
'''
fig=plt.figure(figsize=(3, 2
), dpi= 200, facecolor='w', edgecolor='k')
font = {'size' : 6}
matplotlib.rc('font', **font)
f1=os.path.exists(varex1)
f2=os.path.exists(varfit1)
f3=os.path.exists(varex2)
f4=os.path.exists(varfit2)
plt.axhline(y=sill,linewidth=0.6,color='k',linestyle='--')
plt.title(title,fontsize=fsize*1.5)
plt.xlabel(xlabl,fontsize=fsize*1.5)
ylabl='$\gamma (\mathbf{h}) $'
plt.ylabel(ylabl,rotation=0,fontsize=fsize*1.9,labelpad=15)
plt.xlim(float(xlim[0]),float(xlim[1]))
plt.ylim(float(vlim[0]),float(vlim[1]))
if(f1):
with open(varex1) as f:
Lag=[]
Gamma=[]
for i in range(1):
next(f)
for line in f:
p = line.split()
Lag.append(float(p[1]))
Gamma.append(float(p[2]))
Lag=[float(i) for i in Lag]
Gamma=[float(i) for i in Gamma]
if(f2):
with open(varfit1) as f:
Lagf=[]
Gammaf=[]
for i in range(1):
next(f)
for line in f:
p = line.split()
Lagf.append(float(p[1]))
Gammaf.append(float(p[2]))
Lagf=[float(i) for i in Lagf]
Gammaf=[float(i) for i in Gammaf]
if(f3):
with open(varex2) as f:
Lag2=[]
Gamma2=[]
for i in range(1):
next(f)
for line in f:
p = line.split()
Lag2.append(float(p[1]))
Gamma2.append(float(p[2]))
Lag2=[float(i) for i in Lag2]
Gamma2=[float(i) for i in Gamma2]
if(f4):
with open(varfit2) as f:
Lagf4=[]
Gammaf4=[]
for i in range(1):
next(f)
for line in f:
p = line.split()
Lagf4.append(float(p[1]))
Gammaf4.append(float(p[2]))
Lagf4=[float(i) for i in Lagf4]
Gammaf4=[float(i) for i in Gammaf4]
if(symbol1==None): symbol1= '-or'
if(symbol2==None): symbol2= '-b'
if(symbol3==None): symbol1= '-or'
if(symbol4==None): symbol2= '-b'
if(f1):
plt.plot(Lag, Gamma,symbol1,linewidth=0.4,markersize=1.5,label=label1)
if(f2):
plt.plot(Lagf, Gammaf,symbol2,linewidth=0.8,markersize=1.5,label=label2)
if(f3):
plt.plot(Lag2, Gamma2,symbol3,linewidth=0.4,markersize=1.5,label=label3)
if(f4):
plt.plot(Lagf4, Gammaf4,symbol4,linewidth=0.8,markersize=1.5,label=label4)
fsizet=fsize*1.2
plt.legend(fontsize=str(fsizet),loc=location,frameon=False)
plt.savefig(outfile, bbox_inches='tight')
#
def GSLIB_View(File):
"""a fucntion to convert ASCII flat file
compatible with Geo-EAS to Pandas Data Frame"""
with open(File, 'r') as f:
next(f)
txt=[]
tmp_val=[]
value=[]
ii=0
for line in f:
if(ii==0):
p = line.split()
no=int(p[0])
elif ii<=no:
p = line.replace('\n', '').split()
txt.append(p[0])
else:
tmp=[]
p = line.split()
for j in range(len(txt)):
tmp.append(float(p[j]))
tmp_val.append(tmp)
ii+=1
for j in range(len(txt)):
tmp=[]
for k in range(len(tmp_val)):
tmp.append(tmp_val[k][j])
value.append(tmp)
f.close()
pd_=["" for x in range(len(txt))]
pd_=[]
for j in range(len(txt)):
pd_.append(pd.DataFrame(({txt[j]:value[j]})))
result = pd.concat(pd_, axis=1)
return result