# Source: orange/orange/doc/getWordDictionary.py @ 4491:71ff7263d98a
#
# Revision 4491:71ff7263d98a, 1.2 KB, checked in by janezd <janez.demsar@...>, 6 years ago.
import cPickle
import os
import os.path
import re
2
# Maps each word found in the scanned pages to the list of categories it
# was seen in.  Filled in by addWord(); pickled at the end of the script.
allWords = {}
4
def searchWords(path, category, recurse):
    """Register the words of every HTML page found in directory `path`.

    Each directory entry whose name ends in ".html" or ".htm" is handed to
    addWords() together with `category`.  Subdirectories are scanned the
    same way when `recurse` is true and are skipped otherwise.
    """
    for fn in os.listdir(path):
        nfn = os.path.join(path, fn)
        if os.path.isdir(nfn):
            if recurse:
                searchWords(nfn, category, recurse)
        elif fn.endswith((".html", ".htm")):
            addWords(nfn, category)
13
14
# A run of three or more word characters.  A zero-width word-boundary
# assertion is used instead of consuming the neighbouring non-word
# characters: finditer() returns non-overlapping matches, so a consumed
# separator would no longer be available to the next word and every second
# word of "foo bar baz" would be skipped, as would words at the very start
# or end of the text.
re_word = re.compile(r"\b(?P<word>\w{3,})")
# A dotted name: two or more runs of word characters joined by single dots
# (for example "module.Class.method").  Anchored on a word boundary for the
# same reason as above.
re_dottedWord = re.compile(r"\b(?P<word>\w+(?:\.\w+)+)")
17
def addWord(word, category):
    """Record in `allWords` that `word` occurs in `category`.

    Every word maps to a list of categories; a category is appended only
    if it is not in that word's list yet, so the list has no duplicates
    and keeps the order in which categories were first seen.
    """
    # setdefault() creates the list on first sight of the word; it replaces
    # dict.has_key(), which no longer exists in Python 3.
    seen = allWords.setdefault(word, [])
    if category not in seen:
        seen.append(category)
24   
def addWords(nfn, category):
    """Register every word found in the file `nfn` under `category`.

    The whole file is read at once.  All matches of `re_word` are passed
    to addWord() first, followed by all matches of `re_dottedWord`.
    """
    # The context manager closes the file as soon as it has been read,
    # rather than leaving the handle open until it is garbage collected.
    with open(nfn) as f:
        content = f.read()
    for pattern in (re_word, re_dottedWord):
        for wm in pattern.finditer(content):
            addWord(wm.group("word"), category)
31
# Directories to scan, as (path, recurse) pairs.  The position of a pair in
# this list is the category number recorded in `allWords`.  "widgets" is
# scanned without recursion, so pages below "widgets/catalog" are only
# attributed to category 0.
categories = [("widgets/catalog", 1),
              ("ofb", 1),
              ("modules", 1),
              ("reference", 1),
              ("widgets", 0)]

for category, (path, recurse) in enumerate(categories):
    searchWords(path, category, recurse)

# Write the dictionary inside a context manager so the output file is
# flushed and closed even if pickling fails part-way.
with open("wordDict.pickle", "wb") as f:
    cPickle.dump(allWords, f)
# Note: See TracBrowser for help on using the repository browser.