This article collects typical usage examples of the ols function from Python's statsmodels.formula.api. If you have been wondering what exactly ols does, how to call it, or how it is used in real code, the hand-picked examples below should help.
Twenty code examples of the ols function are shown below, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps the system recommend better Python code examples.
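Before the examples, here is a minimal, self-contained sketch of the typical ols workflow: build an R-style formula string, pass a DataFrame, call .fit(), then inspect the results object. The column names x and y and the random data below are made up purely for illustration.

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

# Toy data: y depends linearly on x plus noise (hypothetical, for illustration only)
rng = np.random.default_rng(0)
df = pd.DataFrame({'x': rng.normal(size=50)})
df['y'] = 2.0 + 3.0 * df['x'] + rng.normal(scale=0.5, size=50)

# Fit an ordinary least squares model from a formula string
results = smf.ols('y ~ x', data=df).fit()

print(results.params)              # intercept and slope estimates
print(results.summary())           # full regression table
print(results.predict(df.head()))  # predictions for the first few rows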
Example 1: regression

def regression(self):
    print self.people.head(n=1)
    # all_bios is the dataframe with the consolidated data.
    # somehow it doesn't work if the class column is named "class"
    self.people.rename(columns={'class': 'dbpedia_class'}, inplace=True)

    self.logfile.write("\n\n Sum Temp Interest NegBinom")
    m = glm("timeInterest ~ C(gender,Treatment(reference='male')) ",
            data=self.people, family=families.NegativeBinomial()).fit()
    self.logfile.write("\n AIC " + str(m.aic))
    for table in m.summary().tables:
        self.logfile.write(table.as_latex_tabular())

    self.logfile.write("\n\n Sum Temp Interest OLS")
    m = ols("timeInterest ~ C(gender,Treatment(reference='male')) ", data=self.people).fit()
    self.logfile.write("\n AIC " + str(m.aic))
    for table in m.summary().tables:
        self.logfile.write(table.as_latex_tabular())

    self.logfile.write("\n\n Pos Temp Interest NegBinom")
    m = glm("timePosInterest ~ C(gender,Treatment(reference='male')) ",
            data=self.people, family=families.NegativeBinomial()).fit()
    self.logfile.write("\n AIC " + str(m.aic))
    self.logfile.write("\n BIC " + str(m.bic))
    for table in m.summary().tables:
        self.logfile.write(table.as_latex_tabular())

    #lim_people = self.people[self.people.timePosInterest>0]
    self.logfile.write("\n\n Pos Temp Interest OLS")
    m = ols("timePosInterest ~ C(gender,Treatment(reference='male')) ", data=self.people).fit()
    self.logfile.write("\n AIC " + str(m.aic))
    self.logfile.write("\n BIC " + str(m.bic))
    for table in m.summary().tables:
        self.logfile.write(table.as_latex_tabular())

Author: clauwag | Project: WikipediaGenderInequality | Lines of code: 33 | Source file: GoogleTrendAnalyzerJSON.py
Example 2: model_formulas

def model_formulas():
    ''' Define models through formulas '''
    # Get the data
    data = read_csv(r'..\Data\data_kaplan\swim100m.csv')

    # Different models
    model1 = ols("time ~ sex", data).fit()          # one factor
    model2 = ols("time ~ sex + year", data).fit()   # two factors
    model3 = ols("time ~ sex * year", data).fit()   # two factors with interaction

    # Model information
    print((model1.summary()))
    print((model2.summary()))
    print((model3.summary()))

    # ANOVAs
    print('-----------------------------------------------------------------')
    print((anova_lm(model1)))
    print('-----------------------------------------------------------------')
    print((anova_lm(model2)))
    print('-----------------------------------------------------------------')
    model3Results = anova_lm(model3)
    print(model3Results)

    # Just to check the correct run
    return model3Results['F'][0]  # should be 156.1407931415788

Author: nsonnad | Project: statsintro | Lines of code: 28 | Source file: modeling.py
Example 3: RunModels

def RunModels(live):
    """Runs regressions that predict birth weight.

    live: DataFrame of pregnancy records
    """
    columns = ['isfirst[T.True]', 'agepreg', 'agepreg2']
    header = ['isfirst', 'agepreg', 'agepreg2']

    rows = []
    formula = 'totalwgt_lb ~ isfirst'
    results = smf.ols(formula, data=live).fit()
    rows.append(FormatRow(results, columns))
    print(formula)
    SummarizeResults(results)

    formula = 'totalwgt_lb ~ agepreg'
    results = smf.ols(formula, data=live).fit()
    rows.append(FormatRow(results, columns))
    print(formula)
    SummarizeResults(results)

    formula = 'totalwgt_lb ~ isfirst + agepreg'
    results = smf.ols(formula, data=live).fit()
    rows.append(FormatRow(results, columns))
    print(formula)
    SummarizeResults(results)

    live['agepreg2'] = live.agepreg**2
    formula = 'totalwgt_lb ~ isfirst + agepreg + agepreg2'
    results = smf.ols(formula, data=live).fit()
    rows.append(FormatRow(results, columns))
    print(formula)
    SummarizeResults(results)

    PrintTabular(rows, header)

Author: 13tsuyoshi | Project: ThinkStats2 | Lines of code: 35 | Source file: regression.py
Example 4: main

def main():
    teams = pd.read_csv('../data/Teams.csv')
    teams = teams[teams['yearID'] >= 1985]
    teams = teams[['yearID', 'teamID', 'Rank', 'R', 'RA', 'G', 'W', 'H', 'BB', 'HBP', 'AB', 'SF', 'HR', '2B', '3B']]
    teams = teams.set_index(['yearID', 'teamID'])

    salaries = pd.read_csv('../data/Salaries.csv')
    salaries_by_yearID_teamID = salaries.groupby(['yearID', 'teamID'])['salary'].sum()
    teams = teams.join(salaries_by_yearID_teamID)

    plot_spending_wins(teams, 2001)

    teams['BA'] = teams['H'] / teams['AB']
    teams['OBP'] = (teams['H'] + teams['BB'] + teams['HBP']) / (teams['AB'] + teams['BB'] + teams['HBP'] + teams['SF'])
    teams['SLG'] = (teams['H'] + teams['2B'] + (2*teams['3B']) + (3*teams['HR'])) / teams['AB']

    # First model
    runs_reg_model1 = sm.ols("R~OBP+SLG+BA", teams)
    runs_reg1 = runs_reg_model1.fit()
    # Second model
    runs_reg_model2 = sm.ols("R~OBP+SLG", teams)
    runs_reg2 = runs_reg_model2.fit()
    # Third model
    runs_reg_model3 = sm.ols("R~BA", teams)
    runs_reg3 = runs_reg_model3.fit()

    print runs_reg1.summary()
    print runs_reg2.summary()
    print runs_reg3.summary()

Author: adilmoujahid | Project: Sabermetrics-intro | Lines of code: 35 | Source file: sabermetrics-intro.py
Example 5: multiple_linear_regression

def multiple_linear_regression():
    '''Multiple linear regression
    chapter 6.3, p. 98'''

    # get the data from the web
    inFile = r'GLM_data/Table 6.3 Carbohydrate diet.xls'
    df = get_data(inFile)

    # do the fit, for the original model ...
    model = ols('carbohydrate ~ age + weight + protein', data=df).fit()
    print model.summary()
    print anova_lm(model)

    # as GLM
    glm = glm('carbohydrate ~ age + weight + protein',
              family=Gaussian(), data=df).fit()
    print 'Same model, calculated with GLM'
    ''' The confidence intervals are different than those from OLS.
    The reason (from Nathaniel Smith):
    OLS uses a method that gives exact results, but only works in the special
    case where all the usual OLS criteria apply - iid Gaussian noise etc. GLM
    instead uses an approximate method which is correct asymptotically but may
    be off for small samples; the tradeoff you get in return is that this method
    works the same way for all GLM models, including those with non-Gaussian
    error terms and non-trivial link functions. So that's why they're different.
    '''
    print glm.summary()

    # ... and for model 1
    model1 = ols('carbohydrate ~ weight + protein', data=df).fit()
    print model1.summary()
    print anova_lm(model1)

Author: HunterAllman | Project: kod | Lines of code: 33 | Source file: code.py
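The docstring in the example above (quoting Nathaniel Smith) explains why OLS and a Gaussian GLM report slightly different confidence intervals. A quick way to see the effect is to fit the same formula both ways and compare conf_int(); the sketch below assumes a DataFrame df with the same columns as Example 5.

import statsmodels.formula.api as smf
from statsmodels.genmod.families import Gaussian

# df is assumed to contain 'carbohydrate', 'age', 'weight' and 'protein' columns
ols_fit = smf.ols('carbohydrate ~ age + weight + protein', data=df).fit()
glm_fit = smf.glm('carbohydrate ~ age + weight + protein',
                  data=df, family=Gaussian()).fit()

# The point estimates agree ...
print(ols_fit.params)
print(glm_fit.params)

# ... but the intervals differ slightly: OLS reports exact t-based intervals,
# while GLM uses the asymptotic normal approximation described in the docstring.
print(ols_fit.conf_int())
print(glm_fit.conf_int())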
Example 6: run_anova

def run_anova(self):
    ps_table_for_anova = self.ps_table[self.ps_table['Area'].isin(self.params.anova_areas)]

    #ps_lm = mixedlm('prob_diff ~ C(Area) * C(Pulse_Frequency)', data=ps_table_for_anova, groups=ps_table_for_anova['Subject']).fit()
    ps_lm = ols('prob_diff ~ C(Area) * C(Pulse_Frequency)', data=ps_table_for_anova).fit()
    anova = anova_lm(ps_lm)
    self.pass_object('fvalue_rf', anova['F'].values[0:3])
    self.pass_object('pvalue_rf', anova['PR(>F)'].values[0:3])

    ps_table_for_anova_low = ps_table_for_anova[ps_table_for_anova['Pulse_Frequency'].isin([10, 25])]
    print 'nsamples =', len(ps_table_for_anova_low)

    ps_lm = ols('prob_diff ~ C(Area) * C(Duration)', data=ps_table_for_anova_low).fit()
    anova = anova_lm(ps_lm)
    self.pass_object('fvalue_rd_low', anova['F'].values[0:3])
    self.pass_object('pvalue_rd_low', anova['PR(>F)'].values[0:3])

    ps_lm = ols('prob_diff ~ C(Area) * C(Amplitude)', data=ps_table_for_anova_low).fit()
    anova = anova_lm(ps_lm)
    self.pass_object('fvalue_ra_low', anova['F'].values[0:3])
    self.pass_object('pvalue_ra_low', anova['PR(>F)'].values[0:3])

    ps_table_for_anova_high = ps_table_for_anova[ps_table_for_anova['Pulse_Frequency'].isin([100, 200])]
    print 'nsamples =', len(ps_table_for_anova_high)

    ps_lm = ols('prob_diff ~ C(Area) * C(Duration)', data=ps_table_for_anova_high).fit()
    anova = anova_lm(ps_lm)
    self.pass_object('fvalue_rd_high', anova['F'].values[0:3])
    self.pass_object('pvalue_rd_high', anova['PR(>F)'].values[0:3])

    ps_lm = ols('prob_diff ~ C(Area) * C(Amplitude)', data=ps_table_for_anova_high).fit()
    anova = anova_lm(ps_lm)
    self.pass_object('fvalue_ra_high', anova['F'].values[0:3])
    self.pass_object('pvalue_ra_high', anova['PR(>F)'].values[0:3])

Author: maciekswat | Project: ram_utils | Lines of code: 34 | Source file: RunAnalysis.py
Example 7: test_statsmodels

def test_statsmodels():
    statsmodels = import_module('statsmodels')  # noqa
    import statsmodels.api as sm
    import statsmodels.formula.api as smf

    df = sm.datasets.get_rdataset("Guerry", "HistData").data
    smf.ols('Lottery ~ Literacy + np.log(Pop1831)', data=df).fit()

Author: evectant | Project: pandas | Lines of code: 7 | Source file: test_downstream.py
Example 8: fit_model

def fit_model(self, model=None, verbose=False):
    if model is None:
        # model = "Mach*B_field*Driving*Temperature"  ## Full
        model = "M+B+k+T+M:B+M:T+B:T"  # Fractional

    for i, (stat, vec, last) in enumerate(zip(self.statistics,
                                              self.respvecs, self.laststep_respvecs)):
        self.model_matrix["resp"] = Series(vec, index=self.model_matrix.index)
        self.model_matrix["laststep_resp"] = Series(last, index=self.model_matrix.index)

        fcn_model = sm.ols("".join(["resp~", model]), data=self.model_matrix)
        laststep_model = sm.ols("".join(["laststep_resp~", model]), data=self.model_matrix)

        results = fcn_model.fit()
        laststep_results = laststep_model.fit()

        self.fitparam.append(results.params[1:])
        self.laststep_fitparam.append(laststep_results.params[1:])

        if i == 0:
            self.paramnames = fcn_model.exog_names[1:]  # Set the names of the coefficients

        if verbose:
            print "Fits for " + stat
            print results.summary()
            print laststep_results.summary()

    return self

Author: Astroua | Project: AstroStat_Results | Lines of code: 29 | Source file: lenth.py
Example 9: backsel

def backsel(df, response, alpha=0.1):
    '''
    Performs backward selection for regression.

    args:
        df       = data frame with response and covariates
        alpha    = a float indicating confidence level
        response = string that represents the response variable, e.g. 'Y'

    attributes:
        summary = ols(formula, data).fit().summary()
    '''
    # initial assignments
    covariates = set(df.columns)
    covariates.remove(response)
    formula = '{} ~ {}'.format(response, ' + '.join(list(covariates)))

    while True:
        pvals = ols(formula, df).fit().pvalues
        candidates = pvals[pvals > alpha]
        if candidates.empty:
            break
        dropvar = candidates[candidates == max(candidates)].index[0]
        covariates.remove(dropvar)
        formula = '{} ~ {}'.format(response, ' + '.join(list(covariates)))

    print 'The optimal model is {}'.format(formula)
    return ols(formula, df).fit().summary()

Author: jcapitz | Project: LinearRegression | Lines of code: 32 | Source file: mselector.py
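As a usage note for backsel, the function expects a DataFrame whose columns are the response plus all candidate covariates. The call below is hypothetical: the data are simulated so that backward selection should usually eliminate the irrelevant X3.

import numpy as np
import pandas as pd

# Simulated data: Y depends on X1 and X2, while X3 is pure noise
rng = np.random.default_rng(1)
df = pd.DataFrame(rng.normal(size=(100, 3)), columns=['X1', 'X2', 'X3'])
df['Y'] = 1.0 + 2.0 * df['X1'] - 1.5 * df['X2'] + rng.normal(scale=0.3, size=100)

# Backward selection at alpha = 0.1; X3 is expected (usually) to be dropped
print(backsel(df, 'Y', alpha=0.1))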
Example 10: model_formulas

def model_formulas():
    ''' Define models through formulas '''
    # Get the data:
    # Development of world record times for the 100m Freestyle, for men and women.
    data = pd.read_csv('swim100m.csv')

    # Different models
    model1 = ols("time ~ sex", data).fit()          # one factor
    model2 = ols("time ~ sex + year", data).fit()   # two factors
    model3 = ols("time ~ sex * year", data).fit()   # two factors with interaction

    # Model information
    print((model1.summary()))
    print((model2.summary()))
    print((model3.summary()))

    # ANOVAs
    print('----------------- Results ANOVAs: Model 1 -----------------------')
    print((anova_lm(model1)))
    print('--------------------- Model 2 -----------------------------------')
    print((anova_lm(model2)))
    print('--------------------- Model 3 -----------------------------------')
    model3Results = anova_lm(model3)
    print(model3Results)

    # Just to check the correct run
    return model3Results['F'][0]  # should be 156.1407931415788

Author: ChengduoZhao | Project: statsintro_python | Lines of code: 30 | Source file: ISP_simpleModels.py
Example 11: test_patsy_lazy_dict

def test_patsy_lazy_dict():
    class LazyDict(dict):
        def __init__(self, data):
            self.data = data

        def __missing__(self, key):
            return np.array(self.data[key])

    data = cpunish.load_pandas().data
    data = LazyDict(data)
    res = ols('EXECUTIONS ~ SOUTH + INCOME', data=data).fit()

    res2 = res.predict(data)
    npt.assert_allclose(res.fittedvalues, res2)

    data = cpunish.load_pandas().data
    data['INCOME'].loc[0] = None

    data = LazyDict(data)
    data.index = cpunish.load_pandas().data.index
    res = ols('EXECUTIONS ~ SOUTH + INCOME', data=data).fit()

    res2 = res.predict(data)
    assert_equal(res.fittedvalues, res2)  # Should lose a record
    assert_equal(len(res2) + 1, len(cpunish.load_pandas().data))

Author: bashtage | Project: statsmodels | Lines of code: 25 | Source file: test_formula.py
Example 12: run_regressions

def run_regressions(mydf):
    print "\n************ Regression Results ************\n"

    # run a very simple regression to estimate effects of regressors on outcome
    results = smf.ols('dollars_per_day ~ \
                C(week_day_name_posted) + day_posted + C(region) + maleness + \
                treat_cost + patient_age:smile_scale + \
                patient_age + smile_scale', data=mydf).fit()
    print results.summary()
    # smile scale is negative but lacks statistical significance

    # model after dropping insignificant terms (backwards selection process)
    results = smf.ols('dollars_per_day ~ \
                weekend_post + treat_cost + patient_age + smile_scale', data=mydf).fit()
    print results.summary()
    # smile scale is negative with p-val < .1

    # run with smile categories (do not treat as linear relationship)
    mydf = pd.read_csv(towrite_path)
    bins = [0, .45, .55, 1]
    smile_cat_names = ["negative", "neutral", "positive"]
    smile_dums = pd.get_dummies(pd.cut(mydf.smile_scale, bins, labels=smile_cat_names))
    mydf = pd.merge(mydf, smile_dums, left_index=True, right_index=True)
    results = smf.ols('dollars_per_day ~ \
                treat_cost + patient_age + \
                weekend_post + negative + positive', data=mydf).fit()
    print results.summary()

Author: cotterman | Project: Astro250_Python-for-Data-Scientists | Lines of code: 30 | Source file: Watsi_data_analytics.py
Example 13: partial_correlation

def partial_correlation(df, x, y, measures):
    '''
    A little (but hopefully quite useful) piece of code that calculates
    the partial correlation between x and y while covarying for the
    remaining measures in a list of measures.

    It requires a data frame, the names of x and y, and a list of measures
    (that don't need to, but can, contain x or y).

    This function returns r and p values.
    '''
    # Import the modules you need
    from scipy.stats import pearsonr
    from statsmodels.formula.api import ols

    # Your covars are all the measures you've selected
    # that aren't x and y
    covars = [z for z in measures if not z == x and not z == y]

    # Your formulae just set x and y to be a function
    # of all the other covariates
    formula_x = x + ' ~ ' + ' + '.join(covars)
    formula_y = y + ' ~ ' + ' + '.join(covars)

    # Fit both of these formulae
    lm_x = ols(formula_x, df).fit()
    lm_y = ols(formula_y, df).fit()

    # Save the residuals from the models
    res_x = lm_x.resid
    res_y = lm_y.resid

    r, p = pearsonr(res_x, res_y)

    return r, p

Author: KirstieJane | Project: DESCRIBING_DATA | Lines of code: 35 | Source file: create_correlation_matrix.py
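A hypothetical call to partial_correlation, using simulated data in which 'a' and 'b' are correlated only through a shared confounder 'c'; the partial correlation controlling for 'c' and 'd' should be close to zero.

import numpy as np
import pandas as pd

# Simulated data: 'a' and 'b' both track the confounder 'c'; 'd' is unrelated noise
rng = np.random.default_rng(2)
c = rng.normal(size=200)
df = pd.DataFrame({'c': c,
                   'd': rng.normal(size=200),
                   'a': c + rng.normal(scale=0.5, size=200),
                   'b': c + rng.normal(scale=0.5, size=200)})

# Raw correlation between 'a' and 'b' is strong; the partial correlation is not
r, p = partial_correlation(df, 'a', 'b', measures=['a', 'b', 'c', 'd'])
print('partial r = {:.3f}, p = {:.3g}'.format(r, p))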
Example 14: run_regressions

def run_regressions(data, formulas):
    """
    Run len(formulas) regressions on the clustered data.

    arguments:
    data -- Dataset, a dataset with the cdb field initialized to
            a DataFrame containing clusters and dep. vars.
    formulas -- a list of strings of the type 'dep_var ~ ex_var + ...';
                see statsmodels documentation for details.

    returns:
    a list of RegressionResults objects, each one containing the results of
    one regression model. See statsmodels documentation for additional info.
    """
    results = []

    # We need to create an additional dataset for the fragility dep. var.
    # because scores from some countries are missing (marked as 'NA');
    # if we feed the statsmodels ols function data with NAs, it throws
    # errors.
    c_frag = data[data['fragility'] != 'NA']
    c_frag[['fragility']] = c_frag['fragility'].astype(float)

    for f in formulas:
        if 'fragility' in f:
            r = sm.ols(formula=f, data=c_frag).fit()
        else:
            r = sm.ols(formula=f, data=data).fit()
        results.append(r)

    return results

Author: marcomorucci | Project: Clustering-Constitutions | Lines of code: 31 | Source file: analyze.py
Example 15: linear_foward_selection

def linear_foward_selection(X_train, y_train):
    '''
    forward selection to optimize adjusted R-squared by adding features that help
    the most, one at a time, until the score goes down or you run out of features.

    not implemented yet. would only make sense for a linear model, not for categorical
    data. presently not called from within the module.
    '''
    # NOTE: 'response' and 'data' are assumed to be module-level names here;
    # as the docstring says, this helper is not wired up yet.
    remaining = set(X_train.columns)
    remaining.remove(response)
    selected = []
    current_score, best_new_score = 0.0, 0.0
    while remaining and current_score == best_new_score:
        scores_with_candidates = []
        for candidate in remaining:
            formula = "{} ~ {} + 1".format(response, ' + '.join(selected + [candidate]))
            score = smf.ols(formula, data).fit().rsquared_adj
            scores_with_candidates.append((score, candidate))
        scores_with_candidates.sort()
        best_new_score, best_candidate = scores_with_candidates.pop()
        if current_score < best_new_score:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
    formula = "{} ~ {} + 1".format(response,
                                   ' + '.join(selected))
    model = smf.ols(formula, data).fit()
    return model

Author: Sandy4321 | Project: meatball_stats | Lines of code: 27 | Source file: meatball.py
Example 16: prepare_data

def prepare_data(subdata, gdp_gr, gdp_per_capita_2013, pop_multiplier, pop_affected, endyear):
    def calccost(pop_exposed, gdp_gr, gdp_per_capita_2013, pop_multiplier, endyear):
        cost = pop_exposed*gdp_per_capita_2013*(1+gdp_gr)**(endyear-2013)*pop_multiplier
        return cost

    subdata.ix[notnull(subdata['share']), 'cost'] = calccost(pop_affected, gdp_gr, gdp_per_capita_2013, pop_multiplier, endyear)
    subdata.ix[notnull(subdata['share']), 'costlog'] = np.log(subdata.ix[notnull(subdata['share']), 'cost'])

    ### predicts missing water level data points
    formula = varin1 + " ~ return_period + wetland_loss + climate_change + runoff + bndconditions"
    olsmodel = sm.ols(formula, data=subdata).fit()
    predictions = olsmodel.predict(subdata)
    subdata.loc[subdata[varin1].isnull().values, varin1] = predictions[subdata[varin1].isnull().values]

    formula = varin2 + " ~ return_period + wetland_loss + climate_change + runoff + bndconditions"
    olsmodel2 = sm.ols(formula, data=subdata).fit()
    res2 = olsmodel2.params
    predictions2 = olsmodel2.predict(subdata)
    subdata.loc[subdata[varin2].isnull().values, varin2] = predictions2[subdata[varin2].isnull().values]

    ### predicts damages based on a few points using water level
    subdata['log{}'.format(varin1)] = np.log(subdata[varin1])
    subdata['log{}'.format(varin2)] = np.log(subdata[varin2])
    formula = "costlog ~ log{}".format(varin1)
    damagemodel = sm.ols(formula, data=subdata).fit()
    predicted_damages = damagemodel.predict(subdata)
    subdata.loc[subdata['costlog'].isnull().values, 'costlog'] = predicted_damages[subdata['costlog'].isnull().values]
    subdata['costestimated'] = np.exp(subdata['costlog'])

    return subdata

Author: julierozenberg | Project: create_files_for_mike11 | Lines of code: 32 | Source file: analyze_results_final.py
Example 17: _do_analysis_no_cross_validation

def _do_analysis_no_cross_validation(self):
    """
    Find the best model (fit) and create self.list_of_fits and self.fit
    """
    self.list_of_fits = []
    # first model is just the mean
    self.list_of_fits.append(fm.ols(formula="Q('{}') ~ 1".format(self.endog), data=self.df).fit())
    # try to improve the model until no improvements can be found
    all_exog = self.list_of_exog[:]
    while all_exog:
        # try each x in all_exog and overwrite the best_fit if we find a better one
        # the first best_fit is the one from the previous round
        best_fit = deepcopy(self.list_of_fits[-1])
        for x in all_exog:
            # make new_fit, compare with best found so far
            formula = self.list_of_fits[-1].model.formula + "+Q('{}')".format(x)
            fit = fm.ols(formula=formula, data=self.df).fit()
            best_fit = self.find_best_bic([best_fit, fit])
        # Sometimes, the obtained fit may be better, but contains insignificant parameters.
        # Correct the fit by removing the insignificant parameters and estimate again
        best_fit = self._prune(best_fit, p_max=self.p_max)
        # if best_fit does not contain more variables than the last fit in self.list_of_fits, exit
        if best_fit.model.formula in self.list_of_fits[-1].model.formula:
            break
        else:
            self.list_of_fits.append(best_fit)
            all_exog.remove(x)
    self.fit = self.list_of_fits[-1]

Author: kdebrab | Project: opengrid | Lines of code: 32 | Source file: regression.py
Example 18: anova_interaction

def anova_interaction(data_lastDV):
    """
    Two-way ANOVA and interaction analysis of given data
    http://statsmodels.sourceforge.net/devel/examples/generated/example_interactions.html

    Note: two-way ANOVAs are for 2+ categorical independent/causal variables, with 2+ levels each

    :param data_lastDV: data frame containing the independent variables in the first two columns, dependent in the third
    :return: None
    """
    col_names = data_lastDV.columns.values  # get the columns' names
    factor_groups = data_lastDV[col_names].dropna()
    if len(col_names) < 3:
        print("ERROR in statsMOOC.py: Not enough columns in dataframe to do interaction analysis: " + str(len(col_names)))

    # two-way anova
    formula = col_names[2] + " ~ C(" + col_names[0] + ") + C(" + col_names[1] + ")"
    formula_interaction = formula.replace('+', '*')
    interaction_lm = ols(formula, data=factor_groups).fit()  # linear model
    print(interaction_lm.summary())

    print(FORMAT_LINE)
    print("- " + col_names[2] + " = " + col_names[0] + " * " + col_names[1] + " Interaction -")
    print(anova_lm(ols(formula_interaction, data=factor_groups).fit(), interaction_lm))

    print(FORMAT_LINE)
    print("- " + col_names[2] + " = " + col_names[0] + " + " + col_names[1] + " ANOVA -")
    print(anova_lm(ols(col_names[2] + " ~ C(" + col_names[0] + ")", data=factor_groups).fit(),
                   ols(col_names[2] + " ~ C(" + col_names[0] + ") + C(" + col_names[1] + ", Sum)", data=factor_groups).fit()))

    print(FORMAT_LINE)
    print("- " + col_names[2] + " = " + col_names[1] + " + " + col_names[0] + " ANOVA -")
    print(anova_lm(ols(col_names[2] + " ~ C(" + col_names[1] + ")", data=factor_groups).fit(),
                   ols(col_names[2] + " ~ C(" + col_names[0] + ") + C(" + col_names[1] + ", Sum)", data=factor_groups).fit()))

Author: UberHowley | Project: spoc-file-processing | Lines of code: 32 | Source file: statsSPOC.py
Example 19: prepare_data

def prepare_data(subdata):
    subdata.ix[notnull(subdata['share']), 'cost'] = subdata.ix[notnull(subdata['share']), 'Pop']
    subdata.ix[notnull(subdata['share']), 'costlog'] = np.log(subdata.ix[notnull(subdata['share']), 'cost'])

    ### predicts missing water level data points
    formula = varin1 + " ~ return_period + wetland_loss + climate_change + runoff + bndconditions"
    olsmodel = sm.ols(formula, data=subdata).fit()
    predictions = olsmodel.predict(subdata)
    subdata.loc[subdata[varin1].isnull().values, varin1] = predictions[subdata[varin1].isnull().values]

    formula = varin2 + " ~ return_period + wetland_loss + climate_change + runoff + bndconditions"
    olsmodel2 = sm.ols(formula, data=subdata).fit()
    res2 = olsmodel2.params
    predictions2 = olsmodel2.predict(subdata)
    subdata.loc[subdata[varin2].isnull().values, varin2] = predictions2[subdata[varin2].isnull().values]

    ### predicts damages based on a few points using water level
    subdata['log{}'.format(varin1)] = np.log(subdata[varin1])
    subdata['log{}'.format(varin2)] = np.log(subdata[varin2])
    formula = "costlog ~ log{}".format(varin1)
    damagemodel = sm.ols(formula, data=subdata).fit()
    predicted_damages = damagemodel.predict(subdata)
    subdata.loc[subdata['costlog'].isnull().values, 'costlog'] = predicted_damages[subdata['costlog'].isnull().values]
    subdata['popestimated'] = np.exp(subdata['costlog'])

    return subdata

Author: julierozenberg | Project: create_files_for_mike11 | Lines of code: 28 | Source file: analyze_results_functions.py
Example 20: _do_analysis_cross_validation

def _do_analysis_cross_validation(self):
    """
    Find the best model (fit) based on cross-validation (leave one out)
    """
    assert len(self.df) < 15, "Cross-validation is not implemented if your sample contains more than 15 datapoints"

    # initialization: first model is the mean, but compute cv correctly.
    errors = []
    formula = "Q('{}') ~ 1".format(self.endog)
    for i in self.df.index:
        # make new_fit, compute cross-validation and store error
        df_ = self.df.drop(i, axis=0)
        fit = fm.ols(formula=formula, data=df_).fit()
        cross_prediction = self._predict(fit=fit, df=self.df.loc[[i], :])
        errors.append(cross_prediction['predicted'] - cross_prediction[self.endog])

    self.list_of_fits = [fm.ols(formula=formula, data=self.df).fit()]
    self.list_of_cverrors = [np.mean(np.abs(np.array(errors)))]

    # try to improve the model until no improvements can be found
    all_exog = self.list_of_exog[:]
    while all_exog:
        # import pdb;pdb.set_trace()
        # try each x in all_exog and overwrite if we find a better one
        # at the end of the iteration (and not earlier), save the best of the iteration
        better_model_found = False
        best = dict(fit=self.list_of_fits[-1], cverror=self.list_of_cverrors[-1])
        for x in all_exog:
            formula = self.list_of_fits[-1].model.formula + "+Q('{}')".format(x)
            # cross_validation, currently only implemented for monthly data
            # compute the mean error for a given formula based on leave-one-out.
            errors = []
            for i in self.df.index:
                # make new_fit, compute cross-validation and store error
                df_ = self.df.drop(i, axis=0)
                fit = fm.ols(formula=formula, data=df_).fit()
                cross_prediction = self._predict(fit=fit, df=self.df.loc[[i], :])
                errors.append(cross_prediction['predicted'] - cross_prediction[self.endog])
            cverror = np.mean(np.abs(np.array(errors)))
            # compare the model with the current fit
            if cverror < best['cverror']:
                # better model, keep it
                # first, reidentify using all the datapoints
                best['fit'] = fm.ols(formula=formula, data=self.df).fit()
                best['cverror'] = cverror
                better_model_found = True

        if better_model_found:
            self.list_of_fits.append(best['fit'])
            self.list_of_cverrors.append(best['cverror'])
        else:
            # if we did not find a better model, exit
            break

        # next iteration with the found exog removed
        all_exog.remove(x)

    self.fit = self.list_of_fits[-1]

Author: kdebrab | Project: opengrid | Lines of code: 59 | Source file: regression.py
Note: The statsmodels.formula.api.ols examples in this article were compiled by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by their respective developers, and copyright remains with the original authors; please consult each project's license before distributing or reusing the code, and do not reproduce this article without permission.