Користувач:DixonD/Statscode
Перейти до навігації
Перейти до пошуку
- getList1000.py — отримання списку 1000 необхідних
- getListWikitable.py — отримання списку з вікітаблиць
- stats.py — головний код обчислення статистики
- upload.py — оновлення сторінки
- checkCategory.py — перевірка категорії 1000 необхідних на відповідність зі списком
# -*- coding: utf-8 -*-
# getList1000.py -- download the Meta page "List of articles every Wikipedia
# should have" and dump every [[en:...]] link target to list.txt, printing
# the section headings and a running count along the way.
import sys


def parse_list(article):
    """Yield ('category', name) and ('title', name) tuples in document order.

    *article* is the wikitext of the Meta list page.  A ('category', ...)
    tuple is produced whenever a '\\n== ... ==' heading occurs between two
    consecutive [[en:...]] links; every link target yields ('title', ...).
    """
    name_last = 0
    name_first = article.find(u'[[en:', name_last)
    while name_first > -1:
        # a section heading between the previous link and this one?
        cat_start = article.rfind(u'\n== ', name_last, name_first)
        if cat_start > -1:
            cat_end = article.find(u'==', cat_start + 3, name_first)
            if cat_end > -1:
                yield ('category', article[cat_start + 3:cat_end])
        # the link target ends at whichever of '|' or ']]' comes first.
        # BUG FIX: the original took min(closing, pipe) without checking
        # for -1, so a link with no later '|' in the text was mangled.
        pipe = article.find(u'|', name_first)
        closing = article.find(u']]', name_first)
        if pipe > -1 and (closing == -1 or pipe < closing):
            name_last = pipe
        else:
            name_last = closing
        if name_last == -1:
            # unterminated link: take the rest of the text and stop
            yield ('title', article[name_first + 5:])
            return
        yield ('title', article[name_first + 5:name_last])
        name_first = article.find(u'[[en:', name_last)


def main():
    """Fetch the list from Meta and write list.txt."""
    sys.path.append(r'..\..\PyWikipediaBot')
    import wikipedia

    meta_wiki = wikipedia.Site('meta', 'meta')
    meta_page = wikipedia.Page(meta_wiki,
                               'List of articles every Wikipedia should have')
    article = meta_page.get(get_redirect=False)
    count = 0
    grand_total = 0
    with open('list.txt', 'w') as f:
        for kind, value in parse_list(article):
            if kind == 'category':
                # new section: print a heading and restart the counter
                print('')
                print(value)
                print(''.center(len(value), '-'))
                count = 0
            else:
                f.write((u'[[en:' + value + u']]').encode("utf_8"))
                f.write('\n')
                count += 1
                grand_total += 1
                print('%d %s' % (count, value.encode("utf_8")))
    print('')
    print('GRAND TOTAL')
    print('-----------')
    print('%d articles' % grand_total)


if __name__ == '__main__':
    main()
# -*- coding: utf-8 -*-
# getListWikitable.py -- extract every wikilink found inside the wikitables
# of a project page and dump the interwiki-prefixed titles to list.txt.
# (Fixed: ur'' literals replaced by plain r'' -- the patterns are ASCII,
# and ur'' is invalid syntax under Python 3.)
import re
import sys

# first capture group = link target; stops at ']]', '|' or a '#' section mark
LINK_PATTERN = re.compile(r'\[\[(.+?)(?:\]\]|\||#)')
# a whole '{| ... |}' wikitable, possibly spanning many lines
WIKITABLE_PATTERN = re.compile(r'\{\|.*?\|\}', re.MULTILINE | re.DOTALL)


def iter_table_links(article):
    """Yield every link target found inside the wikitables of *article*."""
    for wikitable in WIKITABLE_PATTERN.findall(article):
        for link in LINK_PATTERN.findall(wikitable):
            yield link


def main():
    """Fetch the page, extract table links, write list.txt."""
    sys.path.append(r'..\..\PyWikipediaBot')
    import wikipedia

    article_name = u'uk:Вікіпедія:Проект:Популярність/Популярні'
    #article_name = u'uk:Вікіпедія:Проект:Популярність/2010'
    article = wikipedia.Page(None, article_name).get()
    count = 0
    with open('list.txt', 'w') as f:
        for link in iter_table_links(article):
            count += 1
            # normalise through Page.title to get '[[lang:Title]]' form
            title = wikipedia.Page(None, link).title(asLink=True,
                                                     forceInterwiki=True)
            print('%d %s' % (count, title))
            f.write(title.encode("utf_8"))
            f.write('\n')
    print('')
    print('GRAND TOTAL')
    print('-----------')
    print('%d articles' % count)


if __name__ == '__main__':
    main()
# -*- coding: utf-8 -*-
# stats.py -- main statistics computation: loads the article list, gathers
# interwiki translations, and renders a statistics wikitable.
import sys
sys.path.append('..\..\PyWikipediaBot')  # pywikipedia checkout next to this project
import pagegenerators
import wikipedia
import datetime
import re
#configuration
# NOTE(review): defined here but never passed to TranslationsList below,
# which uses its own default of 125 -- confirm whether this is intentional.
preloadNumber = 125
# per-language multiplier applied to a page's cleaned character count
languageWeights = {'uk':1.3, 'en':1.0, 'ru':1.4}
# languages to compare, and their display names for the table headers
languages = ['uk', 'en', 'ru']
languageNames = { 'uk':u'Українська', 'en':u'Англійська', 'ru':u'Російська'}
# wiki whose articles get colour-coding and the extra size columns
localLanguage = 'uk'
# target page for the generated statistics (alternatives kept commented out)
#outputPage = u'Вікіпедія:Проект:Популярність/2010/stat'
outputPage = u'Вікіпедія:Проект:Популярність/Популярні/stat'
#outputPage = u'Вікіпедія:Статті, які повинні бути в усіх вікіпедіях/stat'
class Translations():
    """One article's pages across languages: maps language code -> page.

    Iteration yields the page for each tracked language in first-seen
    order, with None for languages where no page was registered.
    """

    def __init__(self, pages=None, languages=None):
        """*pages*: initial pages to register (default none).
        *languages*: language codes to track; defaults to the languages
        of *pages*.

        Fixed: the original used a shared mutable default (pages=[]).
        """
        if pages is None:
            pages = []
        if languages is None:
            languages = [page.site().language() for page in pages]
        # remove duplicates while preserving first-seen order
        seen = set()
        self.languages = []
        for code in languages:
            if code not in seen:
                seen.add(code)
                self.languages.append(code)
        self.pages = {}
        for page in pages:
            self.addPage(page)

    def __iter__(self):
        """Yield the page for each tracked language (None when absent)."""
        for language in self.languages:
            yield self.getPage(language)

    def addPage(self, page):
        """Register *page* under its language, if that language is tracked."""
        language = page.site().language()
        if language in self.languages:
            self.pages[language] = page

    def getPage(self, language):
        """Return the page stored for *language*, or None when absent."""
        return self.pages.get(language)
class TranslationsList():
    """Builds, for an ordered list of seed pages, one Translations object
    per page, filled with that page's interwiki counterparts in the
    requested languages.  Pages are bulk-loaded via PreloadingGenerator
    and redirects are resolved to their targets."""
    def __init__(self, pages, languages=None, preloadNumber = 125):
        # preloadNumber: batch size handed to PreloadingGenerator
        self.preloadNumber = preloadNumber
        for page in pages:
            self.removeSection(page)
        if languages is None:
            languages = [page.site().language() for page in pages]
        # remove duplicates
        uniqueLanguages = {}
        self.languages = [uniqueLanguages.setdefault(l,l) for l in languages if l not in uniqueLanguages]
        # map url-encoded title -> slot index of the seed page in *pages*
        self.numbering = dict((pages[i].urlname(), i) for i in range(len(pages)))
        # one Translations slot per seed page
        self.list = [Translations(languages=languages) for i in range(len(pages))]
        # pre-load pages
        allInterwiki = []
        pages = [self.removeSection(page) for page in self.preloadPages(pages)]
        for page in pages:
            number = self.getNumber(page)
            self.list[number].addPage(page)
            if page.exists():
                # keep only interwiki links pointing at tracked languages
                interwikis = [self.removeSection(interwiki) for interwiki in page.interwiki() if interwiki.site().language() in self.languages]
                for interwiki in interwikis:
                    # an interwiki belongs to the same slot as its seed page
                    self.setNumber(interwiki, number)
                    allInterwiki.append(interwiki)
        # second bulk load: the interwiki pages themselves
        for page in self.preloadPages(allInterwiki):
            self.list[self.getNumber(page)].addPage(page)
    def __iter__(self):
        # Yield Translations objects in seed-page order.
        for translations in self.list:
            yield translations
    def removeSection(self, page):
        # Clears the pywikipedia-internal section anchor so section links
        # compare/load as the page itself -- presumably safe for these
        # Page objects; confirm against the wikipedia.Page implementation.
        page._section = None
        return page
    def getNumber(self, page):
        # Slot index of *page* (keyed by urlname), or -1 when unknown.
        key = page.urlname()
        return self.numbering[key] if key in self.numbering else -1
    def setNumber(self, page, number):
        # Register *page* under slot *number* for later lookup.
        self.numbering[page.urlname()] = number
    def preloadPages(self, pages, resolveRedirects=True, preserveOrdering = True):
        # Bulk-load *pages*; optionally follow redirect chains, and
        # optionally return the result sorted by slot number.
        # do not load pages that have been loaded already
        result = [page for page in pages if hasattr(page, '_contents')]
        result.extend([page for page in pagegenerators.PreloadingGenerator([page for page in pages if not hasattr(page, '_contents')], pageNumber=self.preloadNumber)])
        if resolveRedirects:
            # iterate because a redirect may point at another redirect;
            # each target inherits the slot number of its source
            possibleRedirects = list(result)
            result = []
            while len(possibleRedirects):
                gen = self.preloadPages(possibleRedirects, resolveRedirects=False, preserveOrdering = False)
                possibleRedirects = []
                for page in gen:
                    if page.isRedirectPage():
                        target = page.getRedirectTarget()
                        self.setNumber(target, self.getNumber(page))
                        possibleRedirects.append(target)
                    else:
                        result.append(page)
        if preserveOrdering:
            result.sort(key=self.getNumber)
        return result
    def getLanguages(self):
        # Deduplicated language codes, in first-seen order.
        return self.languages
class StatisticsCalculator():
    """Computes per-article and per-language statistics for a
    TranslationsList and renders them as a sortable wikitable.

    Classification of the weight-normalised size: 0 -> 'absent',
    < 10000 -> 'stubs', < 30000 -> 'articles', else 'longarticles'.
    """

    def __init__(self, translationList, languageNames=None, languageWeights=None):
        """*translationList*: iterable of Translations objects.
        *languageNames*: language code -> column header for the table.
        *languageWeights*: language code -> size-normalisation factor.

        Fixed: the original used shared mutable defaults ({}).
        """
        self.translationList = translationList
        self.languages = self.translationList.getLanguages()
        self.languageNames = {} if languageNames is None else languageNames
        self.languageWeights = {} if languageWeights is None else languageWeights
        self.stats = [self.StatisticsForTranslations(translations) for translations in self.translationList]
        # 1-based row numbers for the output table
        for i in range(len(self.stats)):
            self.stats[i]['number'] = i + 1
        self.statsPerLanguage = self.CalculateStatsPerLanguage()

    def StatisticsForPage(self, page):
        """Return {'char_count', 'clear_size', 'type'} for one page;
        a missing page counts as absent with size 0."""
        if page is None or not page.exists():
            return {'char_count': 0, 'clear_size': 0, 'type': 'absent'}
        text = page.get().replace('\r\n', '\n')  # normalise line endings
        articleCharCount = self.GetCharCount(text)
        interwikiLength = self.GetInterwikiLength(text)
        commentsLength = self.GetCommentLength(text)
        # normalised size: strip interwiki links and comment bodies,
        # then weight by the per-language factor
        clearArticleSize = (articleCharCount - interwikiLength - commentsLength) * self.languageWeights[page.site().language()]
        articleType = self.GetArticleType(clearArticleSize)
        return {'char_count': articleCharCount, 'clear_size': clearArticleSize, 'type': articleType}

    def StatisticsForTranslations(self, translations):
        """Per-language page statistics for one article; the Translations
        object itself is kept under the 'translations' key."""
        result = {'translations': translations}
        for language in self.languages:
            result[language] = self.StatisticsForPage(translations.getPage(language))
        return result

    def CalculateStatsPerLanguage(self):
        """Aggregate per-language counts, total/average size and rating."""
        overallStats = dict((language, {
            'total_size': 0,
            'absent': 0,
            'stubs': 0,
            'articles': 0,
            'longarticles': 0,
        }) for language in self.languages)
        for language in self.languages:
            for stat in self.stats:
                articleType = stat[language]['type']
                overallStats[language][articleType] += 1
                overallStats[language]['total_size'] += stat[language]['clear_size']
            absent = overallStats[language]['absent']
            stubs = overallStats[language]['stubs']
            articles = overallStats[language]['articles']
            longarticles = overallStats[language]['longarticles']
            totalCount = absent + stubs + articles + longarticles
            overallStats[language]['total_count'] = totalCount
            overallStats[language]['rating'] = self.GetScore(totalCount, absent, stubs, articles, longarticles)
            overallStats[language]['average_size'] = self.GetAverageSize(totalCount, overallStats[language]['total_size'])
        return overallStats

    def GetArticleType(self, charCount):
        """Classify a normalised size into absent/stubs/articles/longarticles."""
        if not charCount:
            return 'absent'
        if charCount < 10000:
            return 'stubs'
        if charCount < 30000:
            return 'articles'
        return 'longarticles'

    def GetInterwikiLength(self, text):
        """Total length of interwiki-link markup ('[[xx:Title]]' plus any
        trailing newlines) in *text*, counting only prefixes that are real
        Wikipedia language codes."""
        result = 0
        interwikiPattern = re.compile(r'\[\[([a-zA-Z\-]+)\s?:(?:[^\[\]\n]*)\]\]\n*')
        # hoisted out of the loop (was re-fetched per match via .keys())
        langs = wikipedia.Family('wikipedia').langs
        for interwiki in interwikiPattern.finditer(text):
            if interwiki.group(1) in langs:
                result += len(interwiki.group(0))
        return result

    def GetCommentLength(self, text):
        """Total length of the *bodies* of HTML comments in *text*.

        Only the text between '<!--' and '-->' is counted (the delimiters
        are not); an unterminated comment contributes 0."""
        comment_len = 0
        comment_last = 0
        comment_first = text.find(u'<!--', comment_last)
        while comment_first > -1:
            comment_last = text.find(u'-->', comment_first)
            if comment_last == -1:
                # unterminated: skip past '<!--' and count nothing
                comment_last = comment_first + 4
            comment_len += (comment_last - comment_first) - 4
            comment_first = text.find(u'<!--', comment_last)
        return comment_len

    def GetCharCount(self, text):
        """Raw character count of *text*."""
        return len(text)

    def GetScore(self, totalCount, absent, stubs, articles, longarticles):
        """0-100 rating: stubs score 1, articles 4, longarticles 9 points
        out of a maximum of 9 per article; 0 when there are no articles."""
        max_score = totalCount * 9
        raw_score = stubs + (articles * 4) + (longarticles * 9)
        if max_score > 0:
            score = 100.0 * raw_score / max_score
        else:
            score = 0
        return score

    def GetAverageSize(self, totalCount, totalSize):
        """Mean normalised size rounded to an int; 0 when there are no pages."""
        if totalCount > 0:
            avg_size = int(round(totalSize / totalCount))
        else:
            avg_size = 0
        return avg_size

    def FormatResults(self, outputPageName, localLanguage='uk'):
        """Render the statistics as pagefromfile-ready wikitext for the
        page *outputPageName*; *localLanguage* gets colour-coding and the
        extra character-count / normalised-size columns."""
        overallStats = self.statsPerLanguage[localLanguage]
        # {{-start-}}/{{-stop-}} and {{-titlestart-}}/{{-titlestop-}} are
        # the markers the pagefromfile bot is invoked with (see upload.py)
        result = u'{{-start-}}{{-titlestart-}}' + outputPageName + u'{{-titlestop-}}'
        result += u'Рейтинг: {0:.2f} ({1} відсутніх, {2} коротких, {3} середніх, {4} довгих)\n\n'.format(overallStats['rating'], overallStats['absent'], overallStats['stubs'], overallStats['articles'], overallStats['longarticles'])
        result += u'Середній розмір статей: {0}\n\n'.format(overallStats['average_size'])
        result += u'Оновлено: %s\n' % datetime.date.today()
        result += u'статті позначені жовтим і червоним кольором мають бути покращені.\n'
        result += u'{| class="wikitable sortable" \n|- \n! №\n'
        for language in self.languages:
            result += u'! ' + self.languageNames[language] + u'\n'
            if language == localLanguage:
                # two extra columns for the local wiki only
                result += u'! Кількість символів\n'
                result += u'! Нормований розмір\n'
        for stat in self.stats:
            translations = stat['translations']
            articleType = stat[localLanguage]['type']
            # row background encodes the local article's class:
            # red = stub, yellow = medium, green = long
            if articleType == 'stubs':
                result += u"|- style='background:#ffe0e0'\n"
            elif articleType == 'articles':
                result += u"|- style='background:#ffffe0'\n"
            elif articleType == 'longarticles':
                result += u"|- style='background:#e0ffe0'\n"
            else:
                result += u"|- \n"
            result += u'| {0} '.format(stat['number'])
            for language in self.languages:
                page = translations.getPage(language)
                if page is not None and page.exists():
                    if language == localLanguage:
                        result += u' || {0}'.format(page.title(asLink=True, textlink=True))
                    else:
                        result += u' || [[:{0}:{1}|{1}]]'.format(language, page.title())
                else:
                    result += u' || відсутня'
                if language == localLanguage:
                    result += u' || ALIGN=RIGHT | {0}'.format(stat[language]['char_count'])
                    result += u' || ALIGN=RIGHT | {0}'.format(stat[language]['clear_size'])
            result += '\n'
        result += u'|}{{-stop-}}'
        return result
# main code -- drive the pipeline: read the article list, resolve
# interwikis, compute statistics, then dump the two output files.
wikipedia.output("Reading list from 'list.txt'...")
pages = list(pagegenerators.TextfilePageGenerator('list.txt'))
wikipedia.output(u'Getting interwikis for every page...')
translationList = TranslationsList(pages, languages=languages)
wikipedia.output(u'Calculating statistics...')
stats = StatisticsCalculator(translationList, languageNames, languageWeights)
wikipedia.output(u"Writing stats to 'stats.txt'...")
with open('stats.txt', 'w') as out:
    out.write(stats.FormatResults(outputPage, localLanguage).encode("utf_8"))
wikipedia.output(u"Writing list of articles to 'listLocal.txt'...")
with open('listLocal.txt', 'w') as out:
    for translation in translationList:
        localPage = translation.getPage(localLanguage)
        if localPage is not None and localPage.exists():
            out.write("{0}\n".format(localPage.title(asLink=True).encode("utf_8")))
# -*- coding: utf-8 -*-
# upload.py -- push the generated stats.txt to the wiki via the standard
# pagefromfile bot, driven through a synthesised command line.
# FIX: the original declared 'coding: cp1251' although every sibling script
# (and this page's content) is UTF-8; declare UTF-8 to match the literals.
import sys
sys.path.append(r'..\..\PyWikipediaBot')
import runpy
import pagefromfile  # ensures the bot module is importable before we run it

# fake the argv that pagefromfile expects; the {{-...-}} markers match the
# ones emitted by StatisticsCalculator.FormatResults in stats.py
sys.argv[1:] = [u'-file:stats.txt',
                u'-summary:Оновлення статистики',
                u'-force',
                u'-notitle',
                u'-titlestart:{{-titlestart-}}',
                u'-titleend:{{-titlestop-}}',
                u'-start:{{-start-}}',
                u'-end:{{-stop-}}',
                u'-minor']
# execute the bot as if it had been started from the command line
runpy.run_module('pagefromfile', run_name="__main__", alter_sys=True)
# -*- coding: utf-8 -*-
# checkCategory.py -- compare the local category of "1000 required" articles
# against the list file produced by stats.py, reporting pages present on
# only one side.
import sys

#configuration
local_lang = "uk"
list_filename = "listLocal.txt"
list_category = u'Категорія:Статті, що повинні бути у всіх Вікіпедіях'


def compare_lists(list_on_meta, list_in_cat):
    """Return (redundant, missed): links only in the category, and links
    only in the meta list, in iteration order of the inputs."""
    redundant = [link for link in list_in_cat if link not in list_on_meta]
    missed = [link for link in list_on_meta if link not in list_in_cat]
    return redundant, missed


def main():
    """Read both lists and print the differences."""
    sys.path.append(r'..\..\PyWikipediaBot')
    import wikipedia
    import catlib
    import pagegenerators

    wikipedia.output('Reading list from "%s"...' % list_filename)
    list_on_meta = set(page.aslink() for page in pagegenerators.TextfilePageGenerator(list_filename))
    wikipedia.output('Getting list from category "%s"...' % list_category)
    cat = catlib.Category(wikipedia.Site(local_lang, 'wikipedia'), list_category)
    list_in_cat = set(page.aslink() for page in pagegenerators.CategorizedPageGenerator(cat, recurse=True))
    redundant, missed = compare_lists(list_on_meta, list_in_cat)
    wikipedia.output('Redundant articles in category:')
    for link in redundant:
        # BUG FIX: the sets hold aslink() *strings*; the original called
        # .title() on them, which is str.title() and title-cased the link
        # markup instead of printing the page title.  Output the link.
        wikipedia.output(link)
    wikipedia.output('Missed articles in category:')
    for link in missed:
        wikipedia.output(link)


if __name__ == '__main__':
    main()