This article collects and summarizes typical usage examples of the Python class wsd.database.MySQLDatabase. If you are wondering what the MySQLDatabase class is for, or how to use it in practice, the curated class examples below may help.
The following presents 20 code examples of the MySQLDatabase class, sorted by popularity by default.
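All 20 examples share the same entry pattern: construct a MySQLDatabase from host, user, password, and database name, then ask it for a view object. Here is a minimal sketch of that shared pattern; the DATABASE_* values shown as literals are placeholders (in the projects below these constants are imported from a configuration module), while the get_work_view/get_build_view calls are taken directly from the examples:

from wsd.database import MySQLDatabase

# Placeholder credentials; the real projects import these constants
# from a project-level configuration module.
DATABASE_HOST = '127.0.0.1'
DATABASE_USER = 'wikiuser'
DATABASE_PASSWORD = 'secret'
DATABASE_NAME = 'wikipedia'

db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
work_view = db.get_work_view()    # read-oriented access (queries, lookups)
build_view = db.get_build_view()  # write-oriented access (cursor, commit, cache)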
Example 1: table_parser
def table_parser(self, file_name, root):
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_build_view = db.get_build_view()
    cursor = db_build_view._cursor
    # setup logging
    LOGGING_FORMAT = '%(levelname)s:\t%(asctime)-15s %(message)s'
    LOGGING_PATH = 'tmp/tableclasses-dbinsert.log'
    logging.basicConfig(filename=LOGGING_PATH, level=logging.DEBUG, format=LOGGING_FORMAT, filemode='w')
    html_parser = WikipediaHTMLTableParser()
    zip_file_path = os.path.join(root, file_name)
    html = self.zip2html(zip_file_path)
    html_parser.feed(html.decode('utf-8'))
    source_article_id = file_name.split('_')[1]
    try:
        fed_parser = WikipediaFedTextParser(html_parser.get_data())
        table_classes = fed_parser.table_classes(None)
        table_classes = list(set(table_classes))
        for table_class in table_classes:
            self.insert_table_class(source_article_id, table_class, cursor)
    except KeyError:
        db_build_view._db_connection.rollback()
        logging.error('KeyError FedTextParser source article id: %s ' % source_article_id)
    db_build_view.commit()
    db_build_view.reset_cache()
Developer ID: trovdimi, Project: wikilinks, Lines of code: 28, Source file: tableclassinserter.py
Example 2: req
def req():
    # Retrieve all articles from the database.
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_worker_view = db.get_work_view()
    articles = db_worker_view.retrieve_all_articles()
    #articles = db_worker_view.retrieve_all_articles_questionmark()
    # measure time
    start = time.clock()
    start_time_iteration = start
    iteration_number = 483
    for i, article in enumerate(articles):
        # print some progress
        if i % 10000 == 0:
            # print the time for the last iteration
            seconds = time.clock() - start_time_iteration
            m, s = divmod(seconds, 60)
            h, m = divmod(m, 60)
            print "Number of crawled articles: %d. Total time for last iteration of 10000 articles: %d:%02d:%02d" % (i, h, m, s)
            start_time_iteration = time.clock()
            iteration_number += 1
        # Thread pool: blocks when more threads than the set limit are running.
        pool.acquire(blocking=True)
        # Create a new thread and pass the article URL to the worker function.
        t = threading.Thread(target=worker, args=(MEDIAWIKI_API_ENDPOINT+urllib.quote(article['title'])+'/'+str(article['rev_id']), article, iteration_number))
        # Start the newly created thread.
        t.start()
    seconds = time.clock() - start
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    print "Total time: %d:%02d:%02d" % (h, m, s)
Developer ID: linksuccess, Project: linksuccess, Lines of code: 34, Source file: crawler.py
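The excerpt above depends on a module-level pool and a worker function that the snippet does not show. Judging from pool.acquire(blocking=True), pool is most likely a bounded semaphore capping the number of concurrent crawler threads. The sketch below is a hypothetical reconstruction; the semaphore limit and the body of worker are assumptions, not part of the original project:

import threading
import urllib2

# Assumption: a bounded semaphore limiting concurrent requests.
pool = threading.BoundedSemaphore(value=25)

def worker(url, article, iteration_number):
    # Hypothetical body: fetch the revision HTML, then release the slot
    # so the main loop in req() can start the next thread.
    try:
        response = urllib2.urlopen(url)
        html = response.read()
        # ... persist html for article['rev_id'] here ...
    finally:
        pool.release()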
Example 3: _evaluate_disambiguations
def _evaluate_disambiguations(self):
    INPUT_FILE = self.read_path('Please enter the path of the samples file [.xml]', default='./tmp/samples.xml')
    LOGGING_PATH = self.read_path('Please enter the path of the logging file [.log]', default='./tmp/evaluation3.log', must_exist=False)
    CONTINUE = self.read_yes_no('This process might take from several minutes to several hours.\nDo you want to continue?')
    if not CONTINUE:
        print '# Aborting...'
        return
    print '# Starting evaluation...'
    # setup logging
    LOGGING_FORMAT = '%(levelname)s:\t%(asctime)-15s %(message)s'
    logging.basicConfig(filename=LOGGING_PATH, level=logging.DEBUG, format=LOGGING_FORMAT, filemode='w')
    # connecting to db
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    work_view = db.get_work_view()
    # measure time
    start = time.clock()
    evaluator = Evaluator(INPUT_FILE, work_view)
    result = evaluator.evaluate_disambiguations()
    seconds = round(time.clock() - start)
    print 'Finished after %02d:%02d minutes' % (seconds / 60, seconds % 60)
    print 'Evaluation done! - precision: %d%%, recall: %d%%' % (round(result['precision']*100), round(result['recall']*100))
Developer ID: plaufer, Project: wikiwsd, Lines of code: 28, Source file: evaluator.py
Example 4: build_links_position_table
def build_links_position_table():
    """Creates the `redirects_candidates` table."""
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    connection = db._create_connection()
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE `redirects_candidates` ('
                   '`id` BIGINT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,'
                   '`source_article_id` BIGINT UNSIGNED NOT NULL,'
                   '`target_article_id` BIGINT UNSIGNED NULL,'
                   '`target_article_name` VARCHAR(1000) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,'
                   ' target_position_in_text INT UNSIGNED NOT NULL,'
                   ' target_position_in_text_only INT UNSIGNED,'
                   ' target_position_in_section INT UNSIGNED,'
                   ' target_position_in_section_in_text_only INT UNSIGNED,'
                   ' section_name VARCHAR(1000) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,'
                   ' section_number INT UNSIGNED,'
                   ' target_position_in_table INT UNSIGNED,'
                   ' table_number INT UNSIGNED,'
                   ' table_css_class VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin,'
                   ' table_css_style VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin,'
                   ' target_x_coord_1920_1080 INT UNSIGNED DEFAULT NULL,'
                   ' target_y_coord_1920_1080 INT UNSIGNED DEFAULT NULL,'
                   'INDEX(`target_article_id`),'
                   'INDEX(`source_article_id`)'
                   ') ENGINE=InnoDB;')
    connection.close()
Developer ID: trovdimi, Project: wikilinks, Lines of code: 28, Source file: startredirectsinserter.py
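For orientation, a row in the redirects_candidates table created above could be inserted as follows. This is a hypothetical sketch, not the project's actual code path (the project fills this table through dedicated inserter classes, and the literal values here are made up); only the NOT NULL columns are supplied, so the coordinate columns keep their NULL defaults:

db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
connection = db._create_connection()
cursor = connection.cursor()
cursor.execute('INSERT INTO `redirects_candidates` '
               '(`source_article_id`, `target_article_name`, '
               'target_position_in_text, section_name) '
               'VALUES (%s, %s, %s, %s);',
               (12345, u'Some Article', 0, u'Lead'))  # made-up example values
connection.commit()
connection.close()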
Example 5: pickle_vis_data_pandas
def pickle_vis_data_pandas():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    df = pd.read_sql('select source_article_id, target_article_id, target_y_coord_1920_1080, target_x_coord_1920_1080, visual_region from link_features', conn)
    print len(df)
    no_dup = df.sort(['source_article_id','target_y_coord_1920_1080','target_x_coord_1920_1080']).groupby(["source_article_id", "target_article_id"]).first()
    print len(no_dup)
    feature = no_dup.loc[no_dup['visual_region']=='lead']
    print len(feature)
    feature.reset_index(inplace=True)
    feature = no_dup.loc[no_dup['visual_region']=='infobox']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id','target_article_id']].to_csv('/home/ddimitrov/tmp/infobox.tsv', sep='\t', index=False)
    feature = no_dup.loc[no_dup['visual_region']=='navbox']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id','target_article_id']].to_csv('/home/ddimitrov/tmp/navbox.tsv', sep='\t', index=False)
    feature = no_dup.loc[no_dup['visual_region']=='left-body']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id','target_article_id']].to_csv('/home/ddimitrov/tmp/left-body.tsv', sep='\t', index=False)
    feature = no_dup.loc[no_dup['visual_region']=='body']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id','target_article_id']].to_csv('/home/ddimitrov/tmp/body.tsv', sep='\t', index=False)
Developer ID: trovdimi, Project: wikilinks, Lines of code: 35, Source file: pickle_data.py
Example 6: run
def run(self):
    self.print_title('This is the interactive runner program')
    self.create_tmp_if_not_exists()
    INPUT_FILE = self.read_path('Please enter the path of the input file [.txt]', default='./tmp/input.txt')
    OUTPUT_FILE = self.read_path('Please enter the path of the output file [.html]', default='./tmp/output.html', must_exist=False)
    LOGGING_PATH = self.read_path('Please enter the path of the logging file [.log]', default='./tmp/runner.log', must_exist=False)
    print '# Starting runner...'
    # setup logging
    LOGGING_FORMAT = '%(levelname)s:\t%(asctime)-15s %(message)s'
    logging.basicConfig(filename=LOGGING_PATH, level=logging.DEBUG, format=LOGGING_FORMAT, filemode='w')
    # measure time
    start = time.clock()
    # connect to db
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    work_view = db.get_work_view()
    # read input
    f = open(INPUT_FILE, 'r')
    text = f.read()
    text = text.replace('\xc2\xa0', ' ')  # replace UTF-8 non-breaking spaces with regular spaces
    f.close()
    # create dummy article
    article = {}
    article['type'] = 'article'
    article['id'] = None
    article['title'] = None
    article['text'] = text
    article['links'] = []
    # identify links
    link_detector = LinkDetector(work_view)
    link_detector.detect_links(article)
    # identify terms
    #term_identifier = TermIdentifier()
    #article = term_identifier.identify_terms(text)
    # find possible meanings
    meaning_finder = MeaningFinder(work_view)
    meaning_finder.find_meanings(article)
    # calculate relatedness
    relatedness_calculator = RelatednessCalculator(work_view)
    # decide for meaning
    decider = Decider(relatedness_calculator)
    decider.decide(article)
    # output results
    html_outputter = HTMLOutputter()
    html_outputter.output(article, OUTPUT_FILE)
    seconds = round(time.clock() - start)
    print 'Finished after %02d:%02d minutes' % (seconds / 60, seconds % 60)
Developer ID: plaufer, Project: wikiwsd, Lines of code: 59, Source file: runner.py
Example 7: pickle_correlations_zeros
def pickle_correlations_zeros():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    print 'read'
    df = pd.read_sql('select source_article_id, target_article_id, IFNULL(counts, 0) as counts from link_features group by source_article_id, target_article_id', conn)
    print 'group'
    article_counts = df.groupby(by=["target_article_id"])['counts'].sum().reset_index()
    print 'write to file'
    article_counts[["target_article_id","counts"]].to_csv(TMP+'article_counts.tsv', sep='\t', index=False)
Developer ID: trovdimi, Project: wikilinks, Lines of code: 10, Source file: weighted_pagerank.py
Example 8: __init__
def __init__(self, path):
    #os.environ["DISPLAY"]=":1"
    print path
    os.environ["DISPLAY"] = ":1"
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    self.db_build_view = db.get_build_view()
    self.cursor = self.db_build_view._cursor
    self.app = QApplication(sys.argv)
    self.path = path
Developer ID: trovdimi, Project: wikilinks, Lines of code: 10, Source file: redirectscandidatespostioninserter.py
Example 9: pickle_aggregated_counts_distribution
def pickle_aggregated_counts_distribution():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_worker_view = db.get_work_view()
    cursor = db_worker_view._cursor
    results = {}
    try:
        cursor.execute('select sum(counts) from clickstream_derived_internal_links group by prev_id;')
        result = cursor.fetchall()
        results['source_article'] = result
    except MySQLdb.Error, e:
        print e
Developer ID: linksuccess, Project: linksuccess, Lines of code: 12, Source file: click_distributions.py
Example 10: build_page_length_table
def build_page_length_table():
    """Creates the `redirects_candidates_page_length` table."""
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    connection = db._create_connection()
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE `redirects_candidates_page_length` ('
                   '`id` BIGINT UNSIGNED NOT NULL PRIMARY KEY,'
                   ' page_length_1920_1080 INT UNSIGNED DEFAULT NULL'
                   ') ENGINE=InnoDB;')
    connection.close()
Developer ID: trovdimi, Project: wikilinks, Lines of code: 12, Source file: startredirectsinserter.py
Example 11: _create_structure
def _create_structure(self):
    # measure time
    start = time.clock()
    # creating structure
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db.build()
    seconds = round(time.clock() - start)
    logging.info('Finished after %02d:%02d minutes' % (seconds / 60, seconds % 60))
    print 'Finished after %02d:%02d minutes' % (seconds / 60, seconds % 60)
Developer ID: linksuccess, Project: linksuccess, Lines of code: 12, Source file: builder.py
Example 12: pickle_category_counts_distribution
def pickle_category_counts_distribution():
    results = {}
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_worker_view = db.get_work_view()
    cursor = db_worker_view._cursor
    for category in ['lead', 'infobox', 'body', 'left-body', 'navbox']:
        try:
            cursor.execute('select counts from link_features where counts is not null and visual_region=%s;', (category,))
            result = cursor.fetchall()
            results[category] = result
        except MySQLdb.Error, e:
            print e
Developer ID: linksuccess, Project: linksuccess, Lines of code: 12, Source file: click_distributions.py
Example 13: correlations
def correlations(network_name):
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    cursor = conn.cursor()
    # wikipedia graph structural statistics
    results = None
    try:
        results = cursor.execute('select c.curr_id, sum(c.counts) as counts from clickstream_derived c where c.link_type_derived= %s group by c.curr_id;', ("internal-link",))
        results = cursor.fetchall()
    except MySQLdb.Error, e:
        print ('error retrieving counts for all links %s (%d)' % (e.args[1], e.args[0]))
Developer ID: trovdimi, Project: wikilinks, Lines of code: 14, Source file: weighted_pagerank.py
Example 14: links_heatmap
def links_heatmap():
    # http://stackoverflow.com/questions/2369492/generate-a-heatmap-in-matplotlib-using-a-scatter-data-set
    print 'loading'
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_worker_view = db.get_work_view()
    coords = db_worker_view.retrieve_all_links_coords()
    print 'coords loaded'
    x = []
    y = []
    page_lengths = db_worker_view.retrieve_all_page_lengths()
    print 'lengths loaded'
    for coord in coords:
        # normalize x to the 1920px viewport width and y to the page length
        x_normed = float(coord['x'])/float(1920)
        y_normed = float(coord['y'])/float(page_lengths[coord['source_article_id']])
        if x_normed <= 1.0 and y_normed <= 1.0:
            x.append(x_normed)
            y.append(y_normed)
    heatmap, xedges, yedges = np.histogram2d(x, y, bins=100)
    extent = [xedges[0], xedges[-1], yedges[-1], yedges[0]]
    fig_size = (2.4, 2)
    #fig_size = (3.5, 3)
    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(heatmap, extent=extent, origin='upper', norm=LogNorm(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Links Heatmap Log Normalized")
    # save before show(), otherwise the file may be empty under GUI backends
    plt.savefig('output/links_heatmap_lognormed_self_loop.pdf')
    plt.show()
    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(heatmap, extent=extent, origin='upper', norm=Normalize(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Links Heatmap Normalized")
    plt.savefig('output/links_heatmap_normed_self_loop.pdf')
    plt.show()
    print "done"
Developer ID: linksuccess, Project: linksuccess, Lines of code: 50, Source file: heatmaps.py
Example 15: build_table
def build_table():
    """Creates the `table_css_class` table."""
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    connection = db._create_connection()
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE `table_css_class` ('
                   '`id` BIGINT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,'
                   '`source_article_id` BIGINT UNSIGNED NOT NULL,'
                   ' css_class VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,'
                   'INDEX(`source_article_id`)'
                   ') ENGINE=InnoDB;')
    connection.close()
Developer ID: trovdimi, Project: wikilinks, Lines of code: 14, Source file: tableclassinserter.py
Example 16: pickle_redirects_ids
def pickle_redirects_ids():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_work_view = db.get_work_view()
    redirects_list_id = []
    with open(HOME+"data/candidate_articles.tsv") as f:
        next(f)
        for line in f:
            line = line.strip().split('\t')
            # look up the article id for the title
            tmp = db_work_view.resolve_title(line[0].replace('_', ' '))
            #print tmp
            if tmp is not None:
                redirects_list_id.append(tmp['id'])
    pickle.dump(redirects_list_id, open(SSD_HOME+"pickle/redirects_ids.obj", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
Developer ID: trovdimi, Project: wikilinks, Lines of code: 15, Source file: redirects_candidates.py
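The dumped object is a plain list of article ids, so reading it back is a single pickle.load call; a minimal sketch, assuming the same SSD_HOME constant as above:

import pickle

with open(SSD_HOME + "pickle/redirects_ids.obj", "rb") as f:
    redirects_list_id = pickle.load(f)
print len(redirects_list_id)  # number of resolved candidate ids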
Example 17: rbo
def rbo():
    print 'loading'
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    cursor = conn.cursor()
    sm = []
    try:
        cursor.execute('select curr_id, sum(counts) as counts_sum, curr_title from clickstream_derived where link_type_derived=%s group by curr_id order by counts_sum desc limit 10000;', ("entry-sm",))
        result = cursor.fetchall()
        for row in result:
            record = {}
            record['curr_id'] = row[0]
            record['counts_sum'] = row[1]
            record['curr_title'] = row[2]
            sm.append(row[0])
    except MySQLdb.Error, e:
        print e
Developer ID: trovdimi, Project: wikilinks, Lines of code: 17, Source file: rbo.py
Example 18: clicks_heatmap_total
def clicks_heatmap_total():
    print 'loading'
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_worker_view = db.get_work_view()
    coords = db_worker_view.retrieve_all_links_coords_clicks()
    print 'coords loaded'
    links = {}
    x = []
    y = []
    values = []
    for coord in coords:
        x_normed = float(coord['x'])/float(1920)
        y_normed = float(coord['y'])/float(coord['page_length'])
        if x_normed <= 1.0 and y_normed <= 1.0:
            x.append(x_normed)
            y.append(y_normed)
            values.append(float(coord['counts']))
    heatmap, xedges, yedges = np.histogram2d(x, y, bins=100, weights=values)
    extent = [xedges[0], xedges[-1], yedges[-1], yedges[0]]
    fig_size = (2.4, 2)
    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(heatmap, extent=extent, origin='upper', norm=LogNorm(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Clicks Heatmap Log Normalized")
    # save before show(), otherwise the file may be empty under GUI backends
    plt.savefig('output/clicks_heatmap_lognormed_self_loop_total.pdf')
    plt.show()
    plt.clf()
    plt.figure(figsize=fig_size)
    plt.grid(True)
    plt.imshow(heatmap, extent=extent, origin='upper', norm=Normalize(), cmap=plt.get_cmap('jet'))
    plt.colorbar()
    #plt.title("Clicks Heatmap Normalized")
    plt.savefig('output/clicks_heatmap_normed_self_loop_total.pdf')
    plt.show()
    print "done"
Developer ID: linksuccess, Project: linksuccess, Lines of code: 46, Source file: heatmaps.py
Example 19: pickle_correlations_zeros_january
def pickle_correlations_zeros_january():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    print 'read'
    df = pd.read_sql('select source_article_id, target_article_id from link_features', conn)
    print 'loaded links'
    df2 = pd.read_sql('select prev_id, curr_id, counts from clickstream_derived_en_201501 where link_type_derived= "internal-link";', conn)
    print 'loaded counts'
    result = pd.merge(df, df2, how='left', left_on=['source_article_id', 'target_article_id'], right_on=['prev_id', 'curr_id'])
    print 'merged counts'
    print result
    article_counts = result.groupby(by=["target_article_id"])['counts'].sum().reset_index()
    article_counts['counts'].fillna(0.0, inplace=True)
    print article_counts
    print 'write to file'
    article_counts[["target_article_id","counts"]].to_csv(TMP+'january_article_counts.tsv', sep='\t', index=False)
Developer ID: trovdimi, Project: wikilinks, Lines of code: 17, Source file: weighted_pagerank.py
Example 20: _extract_articles
def _extract_articles(self):
    INPUT_FILE = WIKI_DUMP_XML_FILE  #self.read_path('Please enter the path of the wiki dump file [.xml]')
    #INPUT_FILE = "/home/ddimitrov/wikiwsd/data/training.xml"
    MAX_ARTICLES_IN_QUEUE = 200  #self.read_number('How many articles should be kept in the memory at any time at most?', 200, 20, 1000)
    NUM_THREADS = 1  #self.read_number('How many threads shall be used to write to the database?', 20, 1, 50)
    CONTINUE = True  #self.read_yes_no('This process might take several days to finish.\nDo you want to continue?')
    if CONTINUE:
        # measure time
        start = time.clock()
        # connect to database and create article queue
        db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
        queue = Queue.Queue(maxsize=MAX_ARTICLES_IN_QUEUE)
        # create reader and threads
        reader = WikipediaReader(INPUT_FILE, queue, extract_text=False)
        threads = []
        for i in range(0, NUM_THREADS):
            inserter = ArticleInserter(queue, db.get_build_view())
            threads.append(inserter)
        # start reader
        reader.start()
        # start insert threads
        for thread in threads:
            thread.start()
        # wait for reading thread, queue and inserters to be done
        reader.join()
        queue.join()
        for thread in threads:
            thread.end()
        for thread in threads:
            thread.join()
        seconds = round(time.clock() - start)
        print 'Finished after %02d:%02d minutes' % (seconds / 60, seconds % 60)
    else:
        print 'Aborting...'
Developer ID: linksuccess, Project: linksuccess, Lines of code: 44, Source file: builder.py
Note: The wsd.database.MySQLDatabase class examples in this article were compiled by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The snippets come from open-source projects and remain the copyright of their original authors; consult each project's license before redistributing or using them. Do not reproduce without permission.