Python api.ols Function Code Examples


This article collects typical usage examples of the Python function statsmodels.formula.api.ols. If you are unsure how to call ols, what it does, or what real-world usage looks like, the curated code examples below should help.



Twenty code examples of the ols function are shown below, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps the system recommend better Python code examples.
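Before the project-specific examples, here is a minimal, self-contained sketch of the typical ols workflow. The DataFrame and its column names (x, group, y) are invented purely for illustration and do not come from any of the projects quoted below.

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

# Hypothetical data: a numeric predictor 'x', a categorical 'group',
# and a response 'y' built from both plus noise (illustrative only)
rng = np.random.default_rng(0)
df = pd.DataFrame({
    "x": rng.normal(size=100),
    "group": rng.choice(["a", "b"], size=100),
})
df["y"] = 2.0 * df["x"] + 1.5 * (df["group"] == "b") + rng.normal(size=100)

# A Patsy formula describes the model: categorical terms go in C(),
# '+' adds main effects, '*' adds main effects plus their interaction
model = smf.ols("y ~ x + C(group)", data=df).fit()

print(model.summary())            # coefficient table, R-squared, AIC/BIC, ...
print(model.params)               # fitted coefficients as a pandas Series
print(model.predict(df.head()))   # predictions for new (here: the same) data

Most of the examples that follow are variations on this pattern: build a formula string, call ols(formula, data=...).fit(), then read off summary(), params, or predict(), or pass the fitted model to anova_lm().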

Example 1: regression

    def regression(self):

        print self.people.head(n=1)
        self.people.rename(columns={'class': 'dbpedia_class'}, inplace=True) # all_bios is the dataframe with the consolidated data. somehow it doesn't work if the class column is named "class"


        self.logfile.write( "\n\n Sum Temp Interest NegBinom")
        m = glm("timeInterest ~ C(gender,Treatment(reference='male')) ", data=self.people, family=families.NegativeBinomial()).fit()
        self.logfile.write( "\n AIC"+str(+m.aic))
        for table in m.summary().tables:
            self.logfile.write(table.as_latex_tabular())

        self.logfile.write( "\n\n Sum Temp Interest OLS")
        m = ols("timeInterest ~ C(gender,Treatment(reference='male')) ", data=self.people).fit()
        self.logfile.write( "\n AIC"+str(+m.aic))
        for table in m.summary().tables:
            self.logfile.write(table.as_latex_tabular())


        self.logfile.write( "\n\n Pos Temp Interest NegBinom")
        m = glm("timePosInterest ~ C(gender,Treatment(reference='male')) ", data=self.people, family=families.NegativeBinomial()).fit()
        self.logfile.write( "\n AIC "+str(m.aic))
        self.logfile.write( "\n BIC "+str(m.bic))
        for table in m.summary().tables:
            self.logfile.write(table.as_latex_tabular())

        #lim_people = self.people[self.people.timePosInterest>0]
        self.logfile.write( "\n\n Pos Temp Interest OLS")
        m = ols("timePosInterest ~ C(gender,Treatment(reference='male')) ", data=self.people).fit()
        self.logfile.write( "\n AIC "+str(m.aic))
        self.logfile.write( "\n BIC "+str(m.bic))
        for table in m.summary().tables:
            self.logfile.write(table.as_latex_tabular())
Author: clauwag | Project: WikipediaGenderInequality | Lines: 33 | Source file: GoogleTrendAnalyzerJSON.py


Example 2: model_formulas

def model_formulas():
    ''' Define models through formulas '''
    # Get the data
    data = read_csv(r'..\Data\data_kaplan\swim100m.csv')
    
    # Different models
    model1 = ols("time ~ sex", data).fit()  # one factor
    model2 = ols("time ~ sex + year", data).fit()   # two factors
    model3 = ols("time ~ sex * year", data).fit()   # two factors with interaction
    
    # Model information
    print((model1.summary()))
    print((model2.summary()))
    print((model3.summary()))
    
    # ANOVAs
    print('-----------------------------------------------------------------')
    print((anova_lm(model1)))
    
    print('-----------------------------------------------------------------')
    print((anova_lm(model2)))
    
    print('-----------------------------------------------------------------')
    model3Results = anova_lm(model3)
    print(model3Results)
    
    # Just to check the correct run
    return model3Results['F'][0] # should be 156.1407931415788
Author: nsonnad | Project: statsintro | Lines: 28 | Source file: modeling.py


Example 3: RunModels

def RunModels(live):
    """Runs regressions that predict birth weight.

    live: DataFrame of pregnancy records
    """
    columns = ['isfirst[T.True]', 'agepreg', 'agepreg2']
    header = ['isfirst', 'agepreg', 'agepreg2']

    rows = []
    formula = 'totalwgt_lb ~ isfirst'
    results = smf.ols(formula, data=live).fit()
    rows.append(FormatRow(results, columns))
    print(formula)
    SummarizeResults(results)

    formula = 'totalwgt_lb ~ agepreg'
    results = smf.ols(formula, data=live).fit()
    rows.append(FormatRow(results, columns))
    print(formula)
    SummarizeResults(results)
    
    formula = 'totalwgt_lb ~ isfirst + agepreg'
    results = smf.ols(formula, data=live).fit()
    rows.append(FormatRow(results, columns))
    print(formula)
    SummarizeResults(results)
    
    live['agepreg2'] = live.agepreg**2
    formula = 'totalwgt_lb ~ isfirst + agepreg + agepreg2'
    results = smf.ols(formula, data=live).fit()
    rows.append(FormatRow(results, columns))
    print(formula)
    SummarizeResults(results)
    
    PrintTabular(rows, header)
Author: 13tsuyoshi | Project: ThinkStats2 | Lines: 35 | Source file: regression.py


Example 4: main

def main():	

	teams = pd.read_csv('../data/Teams.csv')

	teams = teams[teams['yearID'] >= 1985]
	teams = teams[['yearID', 'teamID', 'Rank', 'R', 'RA', 'G', 'W', 'H', 'BB', 'HBP', 'AB', 'SF', 'HR', '2B', '3B']]

	teams = teams.set_index(['yearID', 'teamID'])

	salaries = pd.read_csv('../data/Salaries.csv')

	salaries_by_yearID_teamID = salaries.groupby(['yearID', 'teamID'])['salary'].sum()

	teams = teams.join(salaries_by_yearID_teamID)

	plot_spending_wins(teams, 2001)

	teams['BA'] = teams['H']/teams['AB']
	teams['OBP'] = (teams['H'] + teams['BB'] + teams['HBP']) / (teams['AB'] + teams['BB'] + teams['HBP'] + teams['SF'])
	teams['SLG'] = (teams['H'] + teams['2B'] + (2*teams['3B']) + (3*teams['HR'])) / teams['AB']

	#First Model
	runs_reg_model1 = sm.ols("R~OBP+SLG+BA",teams)
	runs_reg1 = runs_reg_model1.fit()
	#Second Model
	runs_reg_model2 = sm.ols("R~OBP+SLG",teams)
	runs_reg2 = runs_reg_model2.fit()
	#Third Model
	runs_reg_model3 = sm.ols("R~BA",teams)
	runs_reg3 = runs_reg_model3.fit()


	print runs_reg1.summary()
	print runs_reg2.summary()
	print runs_reg3.summary()
Author: adilmoujahid | Project: Sabermetrics-intro | Lines: 35 | Source file: sabermetrics-intro.py


Example 5: multiple_linear_regression

def multiple_linear_regression():
    '''Multiple linear regression
    chapter 6.3, p. 98'''
    
    # get the data from the web
    inFile = r'GLM_data/Table 6.3 Carbohydrate diet.xls'
    df = get_data(inFile)
    
    # do the fit, for the original model ...
    model = ols('carbohydrate ~ age + weight + protein', data=df).fit()
    print model.summary()
    print anova_lm(model)

    # as GLM
    glm = glm('carbohydrate ~ age + weight + protein',
            family=Gaussian(), data=df).fit()
    print 'Same model, calculated with GLM'
    ''' The confidence intervals are different than those from OLS.
    The reason (from Nathaniel Smith):
    OLS uses a method that gives exact results, but only works in the special
    case where all the usual OLS criteria apply - iid Gaussian noise etc. GLM
    instead uses an approximate method which is correct asymptotically but may
    be off for small samples; the tradeoff you get in return is that this method
    works the same way for all GLM models, including those with non-Gaussian
    error terms and non-trivial link functions. So that's why they're different.
    '''

    print glm.summary()
    
    # ... and for model 1
    model1 = ols('carbohydrate ~ weight + protein', data=df).fit()
    print model1.summary()
    print anova_lm(model1)    
Author: HunterAllman | Project: kod | Lines: 33 | Source file: code.py


Example 6: run_anova

    def run_anova(self):
        ps_table_for_anova = self.ps_table[self.ps_table['Area'].isin(self.params.anova_areas)]

        #ps_lm = mixedlm('prob_diff ~ C(Area) * C(Pulse_Frequency)', data=ps_table_for_anova, groups=ps_table_for_anova['Subject']).fit()
        ps_lm = ols('prob_diff ~ C(Area) * C(Pulse_Frequency)', data=ps_table_for_anova).fit()
        anova = anova_lm(ps_lm)
        self.pass_object('fvalue_rf', anova['F'].values[0:3])
        self.pass_object('pvalue_rf', anova['PR(>F)'].values[0:3])

        ps_table_for_anova_low = ps_table_for_anova[ps_table_for_anova['Pulse_Frequency'].isin([10,25])]
        print 'nsamples =', len(ps_table_for_anova_low)

        ps_lm = ols('prob_diff ~ C(Area) * C(Duration)', data=ps_table_for_anova_low).fit()
        anova = anova_lm(ps_lm)
        self.pass_object('fvalue_rd_low', anova['F'].values[0:3])
        self.pass_object('pvalue_rd_low', anova['PR(>F)'].values[0:3])

        ps_lm = ols('prob_diff ~ C(Area) * C(Amplitude)', data=ps_table_for_anova_low).fit()
        anova = anova_lm(ps_lm)
        self.pass_object('fvalue_ra_low', anova['F'].values[0:3])
        self.pass_object('pvalue_ra_low', anova['PR(>F)'].values[0:3])

        ps_table_for_anova_high = ps_table_for_anova[ps_table_for_anova['Pulse_Frequency'].isin([100,200])]
        print 'nsamples =', len(ps_table_for_anova_high)

        ps_lm = ols('prob_diff ~ C(Area) * C(Duration)', data=ps_table_for_anova_high).fit()
        anova = anova_lm(ps_lm)
        self.pass_object('fvalue_rd_high', anova['F'].values[0:3])
        self.pass_object('pvalue_rd_high', anova['PR(>F)'].values[0:3])

        ps_lm = ols('prob_diff ~ C(Area) * C(Amplitude)', data=ps_table_for_anova_high).fit()
        anova = anova_lm(ps_lm)
        self.pass_object('fvalue_ra_high', anova['F'].values[0:3])
        self.pass_object('pvalue_ra_high', anova['PR(>F)'].values[0:3])
Author: maciekswat | Project: ram_utils | Lines: 34 | Source file: RunAnalysis.py


Example 7: test_statsmodels

def test_statsmodels():

    statsmodels = import_module('statsmodels')  # noqa
    import statsmodels.api as sm
    import statsmodels.formula.api as smf
    df = sm.datasets.get_rdataset("Guerry", "HistData").data
    smf.ols('Lottery ~ Literacy + np.log(Pop1831)', data=df).fit()
Author: evectant | Project: pandas | Lines: 7 | Source file: test_downstream.py


Example 8: fit_model

    def fit_model(self, model=None, verbose=False):

        if model is None:
            # model = "Mach*B_field*Driving*Temperature" ## Full
            model = "M+B+k+T+M:B+M:T+B:T" #Fractional

        for i,(stat, vec, last) in enumerate(zip(self.statistics, \
                        self.respvecs, self.laststep_respvecs)):

            self.model_matrix["resp"] = Series(vec, index=self.model_matrix.index)
            self.model_matrix["laststep_resp"] = Series(last, index=self.model_matrix.index)

            fcn_model = sm.ols("".join(["resp~",model]), data=self.model_matrix)
            laststep_model = sm.ols("".join(["laststep_resp~",model]), data=self.model_matrix)

            results = fcn_model.fit()
            laststep_results = laststep_model.fit()

            self.fitparam.append(results.params[1:])
            self.laststep_fitparam.append(laststep_results.params[1:])

            if i==0:
                self.paramnames = fcn_model.exog_names[1:] # Set the names of the coefficients

            if verbose:
                print "Fits for "+ stat
                print results.summary()
                print laststep_results.summary()
        return self
Author: Astroua | Project: AstroStat_Results | Lines: 29 | Source file: lenth.py


Example 9: backsel

def backsel(df, response, alpha = 0.1):
    '''
    Performs backward selection for regression.
    args:
        df = data frame with response and covariates
        alpha = a float indicating confidence level
        response = string that represents the response variable
            e.g. 'Y'
    attributes:
        summary = ols(formula,data).fit().summary()
    '''
    # initial assignments
    covariates = set(df.columns)
    covariates.remove(response)
    formula = '{} ~ {}'.format(response,' + '.join(list(covariates)))
    
    while True:
        
        pvals = ols(formula,df).fit().pvalues
        candidates = pvals[pvals > alpha]
        
        if candidates.empty:
            break
            
        dropvar = candidates[candidates == max(candidates)].index[0]
        covariates.remove(dropvar)
        
        formula = '{} ~ {}'.format(response,' + '.join(list(covariates)))
    
    print 'The optimal model is {}'.format(formula)
    
    return ols(formula,df).fit().summary()
Author: jcapitz | Project: LinearRegression | Lines: 32 | Source file: mselector.py


Example 10: model_formulas

def model_formulas():
    ''' Define models through formulas '''
    
    # Get the data:
    # Development of world record times for the 100m Freestyle, for men and women.
    data = pd.read_csv('swim100m.csv')
    
    # Different models
    model1 = ols("time ~ sex", data).fit()  # one factor
    model2 = ols("time ~ sex + year", data).fit()   # two factors
    model3 = ols("time ~ sex * year", data).fit()   # two factors with interaction
    
    # Model information
    print((model1.summary()))
    print((model2.summary()))
    print((model3.summary()))
    
    # ANOVAs
    print('----------------- Results ANOVAs: Model 1 -----------------------')
    print((anova_lm(model1)))
    
    print('--------------------- Model 2 -----------------------------------')
    print((anova_lm(model2)))
    
    print('--------------------- Model 3 -----------------------------------')
    model3Results = anova_lm(model3)
    print(model3Results)
    
    # Just to check the correct run
    return model3Results['F'][0] # should be 156.1407931415788
Author: ChengduoZhao | Project: statsintro_python | Lines: 30 | Source file: ISP_simpleModels.py


Example 11: test_patsy_lazy_dict

def test_patsy_lazy_dict():
    class LazyDict(dict):
        def __init__(self, data):
            self.data = data

        def __missing__(self, key):
            return np.array(self.data[key])

    data = cpunish.load_pandas().data
    data = LazyDict(data)
    res = ols('EXECUTIONS ~ SOUTH + INCOME', data=data).fit()

    res2 = res.predict(data)
    npt.assert_allclose(res.fittedvalues, res2)

    data = cpunish.load_pandas().data
    data['INCOME'].loc[0] = None

    data = LazyDict(data)
    data.index = cpunish.load_pandas().data.index
    res = ols('EXECUTIONS ~ SOUTH + INCOME', data=data).fit()

    res2 = res.predict(data)
    assert_equal(res.fittedvalues, res2)  # Should lose a record
    assert_equal(len(res2) + 1, len(cpunish.load_pandas().data))
Author: bashtage | Project: statsmodels | Lines: 25 | Source file: test_formula.py


Example 12: run_regressions

def run_regressions(mydf):

    print "\n************ Regression Results ************\n"

    #run a very simple regression to estimate effects of regressors on outcome
    results = smf.ols('dollars_per_day ~ \
                      C(week_day_name_posted) + day_posted + C(region) + maleness + \
                      treat_cost + patient_age:smile_scale + \
                        patient_age + smile_scale', data=mydf).fit()
    print results.summary()
    #smile scale is negative but lacks statistical signficance


    # model after dropping insignificant terms (backwards selection process)
    results = smf.ols('dollars_per_day ~ \
                      weekend_post + treat_cost + patient_age + smile_scale', data=mydf).fit()
    print results.summary()
    #smile scale is negative with p-val<.1

    
    # run with smile categories (do not treat as linear relationship)
    mydf = pd.read_csv(towrite_path)
    bins = [0, .45, .55, 1]
    smile_cat_names = ["negative","neutral","positive"]
    smile_dums = pd.get_dummies(pd.cut(mydf.smile_scale, bins, labels=smile_cat_names))
    mydf = pd.merge(mydf,smile_dums,left_index=True,right_index=True)
    results = smf.ols('dollars_per_day ~ \
                      treat_cost + patient_age + \
                      weekend_post + negative + positive', data=mydf).fit()
    print results.summary() 
Author: cotterman | Project: Astro250_Python-for-Data-Scientists | Lines: 30 | Source file: Watsi_data_analytics.py


Example 13: partial_correlation

def partial_correlation(df, x, y, measures):
    '''
    A little (but hopefully quite useful) piece of code that calculates
    the partial correlation between x and y while covarying for the
    remaining measures in a list of measures.
    
    It requires a data frame, the names of x and y, and a list of measures
    (that don't need to, but can, contain x or y)
    
    This function returns r and p values
    '''
    # Import the modules you need
    from scipy.stats import pearsonr
    from statsmodels.formula.api import ols

    # Your covars are all the measures you've selected
    # that aren't x and y
    covars = [ z for z in measures if not z == x and not z == y ]
                                
    # Your formulae just set x and y to be a function
    # of all the other covariates
    formula_x = x + ' ~ ' + ' + '.join(covars)
    formula_y = y + ' ~ ' + ' + '.join(covars)

    # Fit both of these formulae
    lm_x = ols(formula_x, df).fit()
    lm_y = ols(formula_y, df).fit()
        
    # Save the residuals from the model
    res_x = lm_x.resid
    res_y = lm_y.resid
            
    r, p = pearsonr(res_x, res_y)
    
    return r, p
Author: KirstieJane | Project: DESCRIBING_DATA | Lines: 35 | Source file: create_correlation_matrix.py


Example 14: run_regressions

def run_regressions(data, formulas):
    """
    Run len(formulas) regressions on the clustered data.

    arguments:
    data -- Dataset, a dataset with the cdb field initialized to
            a DataFrame containing clusters and dep.vars.
    formulas --  a list of strings of the type 'dep_var ~ ex_var + ...'"
                 see statsmodels documentation for details.

    returns:
    a list of RegressionResults objects each one containing the results of
    one regression model. See statsmodels documentation for additional info.
    """
    results = []

    # We need to create an additional dataset for the fragility dep.var.
    # because scores from some countries are missing (marked as 'NA')
    # if we feed the statsmodels.ols function data with nas, it throws
    # errors.
    c_frag = data[data['fragility'] != 'NA']
    c_frag[['fragility']] = c_frag['fragility'].astype(float)

    for f in formulas:
        if 'fragility' in f:
            r = sm.ols(formula=f, data=c_frag).fit()
        else:
            r = sm.ols(formula=f, data=data).fit()
        results.append(r)

    return results
Author: marcomorucci | Project: Clustering-Constitutions | Lines: 31 | Source file: analyze.py


Example 15: linear_foward_selection

def linear_foward_selection(X_train, y_train):
    '''
    Forward selection to optimize adjusted R-squared: add the feature that helps the
    most, one at a time, until the score stops improving or no features remain.
    Not implemented yet; it would only make sense for a linear model, not for
    categorical data, and is presently not called from within this module.
    '''
    remaining = {X_train.columns}
    remaining.remove(response)
    selected = []
    current_score, best_new_score = 0.0, 0.0
    while remaining and current_score == best_new_score:
        scores_with_candidates = []
        for candidate in remaining:
            formula = "{} ~ {} + 1".format(response, ' + '.join(selected + [candidate]))
            score = smf.ols(formula, data).fit().rsquared_adj
            scores_with_candidates.append((score, candidate))
        scores_with_candidates.sort()
        best_new_score, best_candidate = scores_with_candidates.pop()
        if current_score < best_new_score:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
    formula = "{} ~ {} + 1".format(response,
                                   ' + '.join(selected))
    model = smf.ols(formula, data).fit()
    return model
Author: Sandy4321 | Project: meatball_stats | Lines: 27 | Source file: meatball.py


Example 16: prepare_data

def prepare_data(subdata,gdp_gr,gdp_per_capita_2013,pop_multiplier,pop_affected,endyear):

	def calccost(pop_exposed,gdp_gr,gdp_per_capita_2013,pop_multiplier,endyear):
		cost=pop_exposed*gdp_per_capita_2013*(1+gdp_gr)**(endyear-2013)*pop_multiplier
		return cost

	subdata.ix[notnull(subdata['share']),'cost']=calccost(pop_affected,gdp_gr,gdp_per_capita_2013,pop_multiplier,endyear)
	subdata.ix[notnull(subdata['share']),'costlog']=np.log(subdata.ix[notnull(subdata['share']),'cost'])

	### predicts missing water level data points

	formula=varin1+" ~ return_period + wetland_loss + climate_change + runoff  + bndconditions"
	olsmodel=sm.ols(formula,data=subdata).fit()
	predictions=olsmodel.predict(subdata)
	subdata.loc[subdata[varin1].isnull().values,varin1]=predictions[subdata[varin1].isnull().values]

	formula=varin2+" ~ return_period + wetland_loss + climate_change + runoff  + bndconditions"
	olsmodel2=sm.ols(formula,data=subdata).fit()
	res2=olsmodel2.params
	predictions2=olsmodel2.predict(subdata)
	subdata.loc[subdata[varin2].isnull().values,varin2]=predictions2[subdata[varin2].isnull().values]

	### predicts damages based on a few points using water level
	subdata['log{}'.format(varin1)]=np.log(subdata[varin1])
	subdata['log{}'.format(varin2)]=np.log(subdata[varin2])

	formula="costlog ~ log{}".format(varin1)
	damagemodel=sm.ols(formula,data=subdata).fit()
	predicted_damages=damagemodel.predict(subdata)
	subdata.loc[subdata['costlog'].isnull().values,'costlog']=predicted_damages[subdata['costlog'].isnull().values]
	subdata['costestimated']=np.exp(subdata['costlog'])
	return subdata
Author: julierozenberg | Project: create_files_for_mike11 | Lines: 32 | Source file: analyze_results_final.py


Example 17: _do_analysis_no_cross_validation

    def _do_analysis_no_cross_validation(self):
        """
        Find the best model (fit) and create self.list_of_fits and self.fit

        """

        self.list_of_fits = []
        # first model is just the mean
        self.list_of_fits.append(fm.ols(formula="Q('{}') ~ 1".format(self.endog), data=self.df).fit())
        # try to improve the model until no improvements can be found
        all_exog = self.list_of_exog[:]
        while all_exog:
            # try each x in all_exog and overwrite the best_fit if we find a better one
            # the first best_fit is the one from the previous round
            best_fit = deepcopy(self.list_of_fits[-1])
            for x in all_exog:
                # make new_fit, compare with best found so far
                formula = self.list_of_fits[-1].model.formula + "+Q('{}')".format(x)
                fit = fm.ols(formula=formula, data=self.df).fit()
                best_fit = self.find_best_bic([best_fit, fit])

            # Sometimes, the obtained fit may be better, but contains unsignificant parameters.
            # Correct the fit by removing the unsignificant parameters and estimate again
            best_fit = self._prune(best_fit, p_max=self.p_max)

            # if best_fit does not contain more variables than last fit in self.list_of_fits, exit
            if best_fit.model.formula in self.list_of_fits[-1].model.formula:
                break
            else:
                self.list_of_fits.append(best_fit)
                all_exog.remove(x)
        self.fit = self.list_of_fits[-1]
Author: kdebrab | Project: opengrid | Lines: 32 | Source file: regression.py


Example 18: anova_interaction

def anova_interaction(data_lastDV):
    """
    Two-way ANOVA and interaction analysis of given data
    http://statsmodels.sourceforge.net/devel/examples/generated/example_interactions.html

    Note: 2way ANOVAs are for 2+ categorical independent/causal variables, with 2+ levels each
    :param data: data frame containing the independent variables in first two columns, dependent in the third
    :return: None
    """

    col_names = data_lastDV.columns.values  # get the columns' names
    factor_groups = data_lastDV[col_names].dropna()
    if len(col_names) < 3:
        print("ERROR in statsMOOC.py: Not enough columns in dataframe to do interaction analysis: " + len(col_names))

    # two-way anova
    formula = col_names[2] + " ~ C(" + col_names[0] + ") + C(" + col_names[1] + ")"
    formula_interaction = formula.replace('+', '*')
    interaction_lm = ols(formula, data=factor_groups).fit()  # linear model
    print(interaction_lm.summary())

    print(FORMAT_LINE)
    print("- " + col_names[2] + " = " + col_names[0] + " * " + col_names[1] + " Interaction -")
    print(anova_lm(ols(formula_interaction, data=factor_groups).fit(), interaction_lm))

    print(FORMAT_LINE)
    print("- " + col_names[2] + " = " + col_names[0] + " + " + col_names[1] + " ANOVA -")
    print(anova_lm(ols(col_names[2] + " ~ C(" + col_names[0] + ")", data=factor_groups).fit(), ols(col_names[2] +" ~ C("+col_names[0]+") + C(" + col_names[1]+", Sum)", data=factor_groups).fit()))

    print(FORMAT_LINE)
    print("- " + col_names[2] + " = " + col_names[1] + " + " + col_names[0] + " ANOVA -")
    print(anova_lm(ols(col_names[2] + " ~ C(" + col_names[1] + ")", data=factor_groups).fit(), ols(col_names[2] +" ~ C("+col_names[0]+") + C(" + col_names[1]+", Sum)", data=factor_groups).fit()))
Author: UberHowley | Project: spoc-file-processing | Lines: 32 | Source file: statsSPOC.py


Example 19: prepare_data

def prepare_data(subdata):

	subdata.ix[notnull(subdata['share']),'cost']=subdata.ix[notnull(subdata['share']),'Pop']
	subdata.ix[notnull(subdata['share']),'costlog']=np.log(subdata.ix[notnull(subdata['share']),'cost'])

	### predicts missing water level data points

	formula=varin1+" ~ return_period + wetland_loss + climate_change + runoff  + bndconditions"
	olsmodel=sm.ols(formula,data=subdata).fit()
	predictions=olsmodel.predict(subdata)
	subdata.loc[subdata[varin1].isnull().values,varin1]=predictions[subdata[varin1].isnull().values]

	formula=varin2+" ~ return_period + wetland_loss + climate_change + runoff  + bndconditions"
	olsmodel2=sm.ols(formula,data=subdata).fit()
	res2=olsmodel2.params
	predictions2=olsmodel2.predict(subdata)
	subdata.loc[subdata[varin2].isnull().values,varin2]=predictions2[subdata[varin2].isnull().values]

	### predicts damages based on a few points using water level
	subdata['log{}'.format(varin1)]=np.log(subdata[varin1])
	subdata['log{}'.format(varin2)]=np.log(subdata[varin2])

	formula="costlog ~ log{}".format(varin1)
	damagemodel=sm.ols(formula,data=subdata).fit()
	predicted_damages=damagemodel.predict(subdata)
	subdata.loc[subdata['costlog'].isnull().values,'costlog']=predicted_damages[subdata['costlog'].isnull().values]
	subdata['popestimated']=np.exp(subdata['costlog'])
	return subdata
Author: julierozenberg | Project: create_files_for_mike11 | Lines: 28 | Source file: analyze_results_functions.py


Example 20: _do_analysis_cross_validation

    def _do_analysis_cross_validation(self):
        """
        Find the best model (fit) based on cross-valiation (leave one out)

        """
        assert len(self.df) < 15, "Cross-validation is not implemented if your sample contains more than 15 datapoints"

        # initialization: first model is the mean, but compute cv correctly.
        errors = []
        formula = "Q('{}') ~ 1".format(self.endog)
        for i in self.df.index:
            # make new_fit, compute cross-validation and store error
            df_ = self.df.drop(i, axis=0)
            fit = fm.ols(formula=formula, data=df_).fit()
            cross_prediction = self._predict(fit=fit, df=self.df.loc[[i], :])
            errors.append(cross_prediction['predicted'] - cross_prediction[self.endog])

        self.list_of_fits = [fm.ols(formula=formula, data=self.df).fit()]
        self.list_of_cverrors = [np.mean(np.abs(np.array(errors)))]

        # try to improve the model until no improvements can be found
        all_exog = self.list_of_exog[:]
        while all_exog:
            # import pdb;pdb.set_trace()
            # try each x in all_exog and overwrite if we find a better one
            # at the end of iteration (and not earlier), save the best of the iteration
            better_model_found = False
            best = dict(fit=self.list_of_fits[-1], cverror=self.list_of_cverrors[-1])
            for x in all_exog:
                formula = self.list_of_fits[-1].model.formula + "+Q('{}')".format(x)
                # cross_validation, currently only implemented for monthly data
                # compute the mean error for a given formula based on leave-one-out.
                errors = []
                for i in self.df.index:
                    # make new_fit, compute cross-validation and store error
                    df_ = self.df.drop(i, axis=0)
                    fit = fm.ols(formula=formula, data=df_).fit()
                    cross_prediction = self._predict(fit=fit, df=self.df.loc[[i], :])
                    errors.append(cross_prediction['predicted'] - cross_prediction[self.endog])
                cverror = np.mean(np.abs(np.array(errors)))
                # compare the model with the current fit
                if cverror < best['cverror']:
                    # better model, keep it
                    # first, reidentify using all the datapoints
                    best['fit'] = fm.ols(formula=formula, data=self.df).fit()
                    best['cverror'] = cverror
                    better_model_found = True

            if better_model_found:
                self.list_of_fits.append(best['fit'])
                self.list_of_cverrors.append(best['cverror'])
            else:
                # if we did not find a better model, exit
                break

            # next iteration with the found exog removed
            all_exog.remove(x)

        self.fit = self.list_of_fits[-1]
Author: kdebrab | Project: opengrid | Lines: 59 | Source file: regression.py



Note: The statsmodels.formula.api.ols examples in this article were compiled by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective developers, and the copyright of the source code remains with the original authors. Please consult each project's license before redistributing or reusing the code; do not reproduce this article without permission.

