Користувач:DixonD/Statscode

  • getList1000.py — fetches the list of the 1000 required articles (the articles every Wikipedia should have)
  • getListWikitable.py — fetches a list of articles from wikitables
  • stats.py — the main statistics-calculation code
  • upload.py — updates the statistics page
  • checkCategory.py — checks the category of the 1000 required articles against the list
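
The scripts run in the order listed: a list goes to list.txt, stats.py turns it into stats.txt, upload.py publishes it, and checkCategory.py cross-checks the category. getList1000.py and getListWikitable.py are alternative first steps, since both write list.txt. A minimal driver sketch, assuming all scripts share one directory (the name run_all.py is hypothetical):

# -*- coding: utf-8 -*-
# run_all.py -- hypothetical pipeline driver; stops on the first failing step
import subprocess
import sys

for script in ['getList1000.py', 'stats.py', 'upload.py', 'checkCategory.py']:
    returncode = subprocess.call([sys.executable, script])
    if returncode != 0:
        sys.exit(returncode)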

getList1000.py

# -*- coding: utf-8 -*-

import sys
sys.path.append(r'..\..\PyWikipediaBot')  # path to the PyWikipediaBot checkout

import wikipedia

article_name = 'List of articles every Wikipedia should have'
 
meta_wiki = wikipedia.Site('meta', 'meta')
meta_page = wikipedia.Page(meta_wiki, article_name)
article   = meta_page.get(get_redirect=False)
 
f = open('list.txt', 'w')
count = 0
grand_total = 0
 
name_last  = 0
name_first = article.find(u'[[en:', name_last)
while name_first > -1:
    # position of a possible '|' separating the link target from its label
    name_mid = article.find(u'|', name_first)

    # detect a '== ... ==' section heading between the previous link and this one
    cat_start = article.rfind(u'\n== ', name_last, name_first)
    if cat_start > -1:
        cat_end = article.find(u'==', cat_start + 3, name_first)
        if cat_end > -1:
            cat = article[cat_start+3:cat_end]
            print
            print cat
            print ''.center(len(cat), '-')
            count = 0

    # the link target ends at ']]' or at the '|' separator, whichever comes first
    name_last = article.find(u']]', name_first)
    if name_mid > -1 and name_mid < name_last:
        name_last = name_mid
    article_item = article[name_first+5:name_last]
    f.write((u'[[en:' + article_item + u']]').encode("utf_8"))
    f.write('\n')
    count += 1
    grand_total += 1
    print count, article_item.encode("utf_8")
    name_first = article.find(u'[[en:', name_last)
 
f.close()
 
print ''
print 'GRAND TOTAL'
print '-----------'
print grand_total, 'articles'
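
For reference, the scan above collects every '[[en:...]]' target, with '== ... ==' headings resetting the per-section counter. A minimal regex sketch of the same link extraction, run on a made-up fragment:

# -*- coding: utf-8 -*-
# illustration only: the '[[en:...]]' extraction above, done with a regex
import re

sample = u'== People ==\n# [[en:Isaac Newton|Newton]]\n# [[en:Marie Curie]]\n'
for match in re.finditer(ur'\[\[en:(.+?)(?:\||\]\])', sample):
    print match.group(1)
# prints 'Isaac Newton', then 'Marie Curie'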

getListWikitable.py

# -*- coding: utf-8 -*-
import re

import sys
sys.path.append(r'..\..\PyWikipediaBot')  # path to the PyWikipediaBot checkout

import wikipedia

article_name = u'uk:Вікіпедія:Проект:Популярність/Популярні'
#article_name = u'uk:Вікіпедія:Проект:Популярність/2010'

article = wikipedia.Page(None, article_name).get()

f = open('list.txt', 'w')

# capture a link target, stopping at the closing ']]', a '|' label, or a '#' section
linkPattern = re.compile(ur'\[\[(.+?)(?:\]\]|\||#)')
# match whole '{| ... |}' wikitables, including newlines
wikitablePattern = re.compile(ur'\{\|.*?\|\}', re.MULTILINE | re.DOTALL)

count = 0
for wikitable in wikitablePattern.findall(article):
    for link in linkPattern.findall(wikitable):
        count += 1
        title = wikipedia.Page(None, link).title(asLink=True, forceInterwiki=True)
        print count, title
        f.write(title.encode("utf_8"))
        f.write('\n')

f.close()

print ''
print 'GRAND TOTAL'
print '-----------'
print count, 'articles'
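
The two patterns combine so that only links inside '{| ... |}' tables are collected; a toy example with made-up table content:

# -*- coding: utf-8 -*-
# illustration only: links outside the wikitable are skipped
import re

linkPattern = re.compile(ur'\[\[(.+?)(?:\]\]|\||#)')
wikitablePattern = re.compile(ur'\{\|.*?\|\}', re.MULTILINE | re.DOTALL)

sample = u'intro [[ignored]]\n{|\n| [[Київ]] || [[Львів|місто]]\n|}\n'
for wikitable in wikitablePattern.findall(sample):
    for link in linkPattern.findall(wikitable):
        print link.encode("utf_8")
# prints 'Київ', then 'Львів'; '[[ignored]]' sits outside the table
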
stats.py

# -*- coding: utf-8 -*-
import sys

sys.path.append(r'..\..\PyWikipediaBot')  # path to the PyWikipediaBot checkout

import pagegenerators
import wikipedia
import datetime
import re

# configuration
preloadNumber    = 125
# per-language multipliers applied to the cleaned character counts
languageWeights  = {'uk':1.3, 'en':1.0, 'ru':1.4}
languages        = ['uk', 'en', 'ru']
languageNames    = { 'uk':u'Українська', 'en':u'Англійська', 'ru':u'Російська'}
localLanguage    = 'uk'
#outputPage       = u'Вікіпедія:Проект:Популярність/2010/stat'
outputPage       = u'Вікіпедія:Проект:Популярність/Популярні/stat'
#outputPage      = u'Вікіпедія:Статті, які повинні бути в усіх вікіпедіях/stat'

class Translations():
    """One article's pages, keyed by language code."""
    def __init__(self, pages=None, languages=None):
        if pages is None:
            pages = []
        if languages is None:
            languages = [page.site().language() for page in pages]

        # remove duplicate languages while preserving order
        uniqueLanguages = {}
        self.languages = [uniqueLanguages.setdefault(l, l) for l in languages if l not in uniqueLanguages]

        self.pages = {}
        for page in pages:
            self.addPage(page)

    def __iter__(self):
        for language in self.languages:
            yield self.getPage(language)

    def addPage(self, page):
        language = page.site().language()
        if language in self.languages:
            self.pages[language] = page

    def getPage(self, language):
        return self.pages[language] if language in self.pages else None


class TranslationsList():
    """Builds a Translations object for every listed page, pre-loading interwikis."""
    def __init__(self, pages, languages=None, preloadNumber=125):
        self.preloadNumber = preloadNumber

        for page in pages:
            self.removeSection(page)

        if languages is None:
            languages = [page.site().language() for page in pages]

        # remove duplicates
        uniqueLanguages = {}
        self.languages = [uniqueLanguages.setdefault(l,l) for l in languages if l not in uniqueLanguages]

        self.numbering = dict((pages[i].urlname(), i) for i in range(len(pages)))
        self.list = [Translations(languages=languages) for i in range(len(pages))]

        # pre-load pages
        allInterwiki = []
        pages = [self.removeSection(page) for page in self.preloadPages(pages)]
        for page in pages:
            number = self.getNumber(page)
            self.list[number].addPage(page)

            if page.exists():
                interwikis = [self.removeSection(interwiki) for interwiki in page.interwiki() if interwiki.site().language() in self.languages]
                for interwiki in interwikis:
                    self.setNumber(interwiki, number)
                    allInterwiki.append(interwiki)

        for page in self.preloadPages(allInterwiki):
            self.list[self.getNumber(page)].addPage(page)

    def __iter__(self):
        for translations in self.list:
            yield translations

    def removeSection(self, page):
        # drop any '#section' part so that pages compare by title only
        page._section = None
        return page

    def getNumber(self, page):
        key = page.urlname()
        return self.numbering[key] if key in self.numbering else -1

    def setNumber(self, page, number):
        self.numbering[page.urlname()] = number

    def preloadPages(self, pages, resolveRedirects=True, preserveOrdering = True):
        # do not re-fetch pages that have been loaded already
        result = [page for page in pages if hasattr(page, '_contents')]
        notLoaded = [page for page in pages if not hasattr(page, '_contents')]
        result.extend(pagegenerators.PreloadingGenerator(notLoaded, pageNumber=self.preloadNumber))

        if resolveRedirects:
            possibleRedirects = list(result)
            result = []
            while len(possibleRedirects):
                gen = self.preloadPages(possibleRedirects, resolveRedirects=False, preserveOrdering = False)
                possibleRedirects = []
                for page in gen:
                    if page.isRedirectPage():
                        target = page.getRedirectTarget()
                        self.setNumber(target, self.getNumber(page))
                        possibleRedirects.append(target)
                    else:
                        result.append(page)

        if preserveOrdering:
            result.sort(key=self.getNumber)

        return result

    def getLanguages(self):
        return self.languages



class StatisticsCalculator():

    def __init__(self, translationList, languageNames=None, languageWeights=None):
        self.translationList = translationList
        self.languages = self.translationList.getLanguages()
        self.languageNames = languageNames or {}
        self.languageWeights = languageWeights or {}

        self.stats = [self.StatisticsForTranslations(translations) for translations in self.translationList]
        for i in range(len(self.stats)):
            self.stats[i]['number'] = i+1
        self.statsPerLanguage = self.CalculateStatsPerLanguage()

    def StatisticsForPage(self, page):
        if page is None or not page.exists():
            return {'char_count': 0, 'clear_size': 0, 'type': 'absent'}

        text = page.get().replace('\r\n', '\n')
        articleCharCount = self.GetCharCount(text)
        interwikiLength = self.GetInterwikiLength(text)
        commentsLength = self.GetCommentLength(text)
        # weighted size with interwiki links and HTML comments excluded
        clearArticleSize = (articleCharCount - interwikiLength - commentsLength)*self.languageWeights[page.site().language()]
        articleType = self.GetArticleType(clearArticleSize)

        return {'char_count': articleCharCount, 'clear_size': clearArticleSize, 'type': articleType}

    def StatisticsForTranslations(self, translations):
        result = {'translations':translations}
        for language in self.languages:
            result[language] = self.StatisticsForPage(translations.getPage(language))
        return result

    def CalculateStatsPerLanguage(self):
        overallStats = dict((language, {
            'total_size': 0,
            'absent': 0,
            'stubs': 0,
            'articles': 0,
            'longarticles': 0,
        }) for language in self.languages)

        for language in self.languages:
            for stat in self.stats:
                articleType = stat[language]['type']
                overallStats[language][articleType] += 1
                overallStats[language]['total_size'] += stat[language]['clear_size']

            absent       = overallStats[language]['absent']
            stubs        = overallStats[language]['stubs']
            articles     = overallStats[language]['articles']
            longarticles = overallStats[language]['longarticles']
            totalCount   = absent + stubs + articles + longarticles

            overallStats[language]['total_count'] = totalCount
            overallStats[language]['rating'] = self.GetScore(totalCount, absent, stubs, articles, longarticles)
            overallStats[language]['average_size'] = self.GetAverageSize(totalCount, overallStats[language]['total_size'])

        return overallStats

    def GetArticleType(self, charCount):
        # thresholds on the weighted size: under 10k a stub, under 30k an article
        if not charCount:
            return 'absent'
        if charCount < 10000:
            return 'stubs'
        if charCount < 30000:
            return 'articles'
        return 'longarticles'

    def GetInterwikiLength(self, text):
        # total length of interwiki links such as '[[en:...]]'
        result = 0
        interwikiPattern = re.compile(r'\[\[([a-zA-Z\-]+)\s?:(?:[^\[\]\n]*)\]\]\n*')
        for interwiki in interwikiPattern.finditer(text):
            lang = interwiki.group(1)
            # count only prefixes that are real Wikipedia language codes
            if lang in wikipedia.Family('wikipedia').langs.keys():
                result += len(interwiki.group(0))

        return result

    def GetCommentLength(self, text):
        # total length of '<!-- ... -->' comments, markers included
        comment_len   = 0
        comment_first = text.find(u'<!--')
        while comment_first > -1:
            comment_last = text.find(u'-->', comment_first)
            if comment_last == -1:
                # unterminated comment: skip the opener and keep scanning
                comment_last = comment_first + 4
            else:
                comment_len += (comment_last + 3) - comment_first
            comment_first = text.find(u'<!--', comment_last)

        return comment_len

    def GetCharCount(self, text):
        return len(text)

    def GetScore(self, totalCount, absent, stubs, articles, longarticles):
        # stubs score 1 point, articles 4, long articles 9;
        # the rating is the percentage of the maximum possible score
        max_score = totalCount * 9
        raw_score = stubs + (articles * 4) + (longarticles * 9)
        if max_score > 0:
            return 100.0 * raw_score / max_score
        return 0

    def GetAverageSize(self, totalCount, totalSize):
        if totalCount > 0:
            return int(round(totalSize / totalCount))
        return 0


    def FormatResults(self, outputPageName, localLanguage = 'uk'):
        overallStats = self.statsPerLanguage[localLanguage]
        result = u'{{-start-}}{{-titlestart-}}' + outputPageName + u'{{-titlestop-}}'
        result += u'Рейтинг: {0:.2f} ({1} відсутніх, {2} коротких, {3} середніх, {4} довгих)\n\n'.format(overallStats['rating'], overallStats['absent'], overallStats['stubs'], overallStats['articles'], overallStats['longarticles'])
        result += u'Середній розмір статей: {0}\n\n'.format(overallStats['average_size'])
        result += u'Оновлено: %s\n' % datetime.date.today()
        result += u'Статті, позначені жовтим і червоним кольором, мають бути покращені.\n'

        result += u'{| class="wikitable sortable" \n|- \n! №\n'
        for language in self.languages:
            result += u'! ' + self.languageNames[language] + u'\n'
            if language == localLanguage:
                result += u'! Кількість символів\n'
            result += u'! Нормований розмір\n'

        for stat in self.stats:
            translations = stat['translations']

            articleType = stat[localLanguage]['type']

            if articleType == 'stubs':
                result += u"|- style='background:#ffe0e0'\n"
            elif articleType == 'articles':
                result += u"|- style='background:#ffffe0'\n"
            elif articleType == 'longarticles':
                result += u"|- style='background:#e0ffe0'\n"
            else:
                result += u"|- \n"

            result += u'| {0} '.format(stat['number'])

            for language in self.languages:
                page = translations.getPage(language)
                if page is not None and page.exists():
                    if language == localLanguage:
                        result += u' || {0}'.format(page.title(asLink=True, textlink=True))
                    else:
                        result += u' || [[:{0}:{1}|{1}]]'.format(language, page.title())
                else:
                    result += u' || відсутня'

                if language == localLanguage:
                    result += u' || ALIGN=RIGHT | {0}'.format(stat[language]['char_count'])
                result += u' || ALIGN=RIGHT | {0}'.format(stat[language]['clear_size'])

            result += '\n'

        result += u'|}{{-stop-}}'
        return result


# main code
wikipedia.output('Reading list from \'list.txt\'...')
pages = [page for page in pagegenerators.TextfilePageGenerator('list.txt')]

wikipedia.output(u'Getting interwikis for every page...')
translationList = TranslationsList(pages, languages=languages)

wikipedia.output(u'Calculating statistics...')
stats = StatisticsCalculator(translationList, languageNames, languageWeights)

wikipedia.output(u'Writing stats to \'stats.txt\'...')
with open('stats.txt', 'w') as f:
    f.write(stats.FormatResults(outputPage, localLanguage).encode("utf_8"))

wikipedia.output(u'Writing list of articles to \'listLocal.txt\'...')
with open('listLocal.txt', 'w') as f:
    for translation in translationList:
        page = translation.getPage(localLanguage)
        if page is not None and page.exists():
            f.write("{0}\n".format(page.title(asLink=True).encode("utf_8")))

upload.py

# -*- coding: utf-8 -*-

import sys
sys.path.append(r'..\..\PyWikipediaBot')  # path to the PyWikipediaBot checkout

import runpy

# arguments handed to the standard pagefromfile bot
sys.argv[1:] = [u'-file:stats.txt',
                u'-summary:Оновлення статистики',
                u'-force',
                u'-notitle',
                u'-titlestart:{{-titlestart-}}',
                u'-titleend:{{-titlestop-}}',
                u'-start:{{-start-}}',
                u'-end:{{-stop-}}',
                u'-minor']
runpy.run_module('pagefromfile', run_name="__main__", alter_sys=True)
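
Each entry that stats.py writes to stats.txt is framed by the custom markers matched by the flags above; a minimal sketch of that framing, with placeholder title and body:

# -*- coding: utf-8 -*-
# sketch of one stats.txt entry; the title sits between the title markers
import re

entry = u'{{-start-}}{{-titlestart-}}Some/Page{{-titlestop-}}...wikitext...{{-stop-}}'
print re.search(ur'\{\{-titlestart-\}\}(.*?)\{\{-titlestop-\}\}', entry).group(1)
# prints 'Some/Page'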

checkCategory.py

# -*- coding: utf-8 -*-

import sys

sys.path.append(r'..\..\PyWikipediaBot')  # path to the PyWikipediaBot checkout

import wikipedia
import catlib
import pagegenerators

# configuration
local_lang      = "uk"
list_filename   = "listLocal.txt"
list_category   = u'Категорія:Статті, що повинні бути у всіх Вікіпедіях'

# main code
wikipedia.output('Reading list from "%s"...' % list_filename)
list_on_meta = set(page.aslink() for page in pagegenerators.TextfilePageGenerator(list_filename))

wikipedia.output('Getting list from category "%s"...' % list_category)
cat = catlib.Category(wikipedia.Site(local_lang, 'wikipedia'), list_category)
list_in_cat = set(page.aslink() for page in pagegenerators.CategorizedPageGenerator(cat, recurse=True))

wikipedia.output('Redundant articles in the category:')
for article in list_in_cat:
    if article not in list_on_meta:
        wikipedia.output(article)

wikipedia.output('Articles missing from the category:')
for article in list_on_meta:
    if article not in list_in_cat:
        wikipedia.output(article)
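
The two report loops are plain set differences; an equivalent self-contained illustration with made-up links:

# illustration: the report loops above compute set differences
list_on_meta = set([u'[[A]]', u'[[B]]'])
list_in_cat  = set([u'[[B]]', u'[[C]]'])
print sorted(list_in_cat - list_on_meta)  # redundant in the category: [u'[[C]]']
print sorted(list_on_meta - list_in_cat)  # missing from the category: [u'[[A]]']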