#!/usr/bin/env python
#    (C) 2008-2015 Parag Nemade <pnemade@redhat.com>
#
#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program; if not, write to the Free Software
#    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
#    02110-1301, USA.

import codecs
import os
import sys
import re

# Scratch files shared by the pipeline stages (all under /tmp):
wordsfile = "/tmp/words.txt"        # raw candidate words, one per line (read_file output)
wordstmpfile = "/tmp/wordsdic.txt"  # normalised words, duplicates still present (extract_words output)
dicfile = "/tmp/words.dic"          # final unique word list fed to wordlist2hunspell

class WordXt(object):
    ''' Class for extracting sorted uniq word list '''
    def __init__(self):
        arguments = sys.argv[1:]
        count = len(arguments)
        if count < 2 or count > 2:
            print "usage:wordxtr <LangCode> <Full directory path to text data files>"
            exit()
        if len(sys.argv[1]) < 5 or len(sys.argv[1]) > 6:
            print "Enter correct language isocode along with country code."
            print "Check /usr/share/iso-codes/iso_639.tab for language code"
            print "e.g. For Hindi language use hi_IN or for Nepali language use ne_NP"
            exit()
        else:
            self.lng = sys.argv[1]

        self.dirname = sys.argv[2]
        if not os.path.exists(self.dirname):
            print "Enter full directory path where text files exists "
            exit()

        print "Creating dictionary for language \"%s\"" %self.lng + \
                  " using text data in directory \"%s\"" %self.dirname
        self.Create(self.dirname)

    def read_file(self, filename):
        ''' Read the input file '''
        filen = codecs.open(filename, "r", "utf-8")
        fOut = codecs.open(wordsfile, "w", "utf-8")

        u = filen.readlines()
        u = [x.strip() for x in u]
        for word in u:
            if word.startswith('msgstr "'):
                wl = word[8:-1].split(' ')
            else:
                wl = word.split(' ')
            nSentences = len(wl)
            if nSentences > 1:
                for w in wl:
                    if len(w) > 1:
                        if w.find('%', 1) < 1:
                            fOut.write(w+'\n')
            else:
                fOut.write(word+'\n')
        fOut.close()

    def extract_words(self):
        ''' Extract the words from input file(s) '''
        f = open(wordsfile)
        content = f.read()

        noline = re.sub('(\n)+', ' ', content)
        line_pat = re.sub('[.!:?]', '\n', noline)
        line_pat2 = re.sub("[,\-']", ' ', line_pat)
        line_pat3 = re.sub("[,';:+{}=/@&*!~`#$^|_?<>]", ' ', line_pat2)
        line_pat4 = re.sub('"', '', line_pat3)
        line_pat5 = re.sub('\\\\', ' ', line_pat4)
        line_pat6 = re.sub('\[', '', line_pat5)
        line_pat7 = re.sub('\]', '', line_pat6)
        line_pat8 = re.sub("[()%1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ]", '', line_pat7)

        sentences = line_pat8.split('\n')
        nSentences = len(sentences)
	#print 'The text has ' + str(nSentences) + ' sentences.'

        words = []
        for s in sentences:
            words = words + s.split(' ')
            words = [w for w in words if len(w) > 0]

        nWords = len(words)
        #print 'The text has a total of ' + str(nWords) + ' words in it.'

        fOut = codecs.open(wordstmpfile, "w", "utf-8")
        wDict = {}
        for w in words:
            if wDict.has_key(w):
                wDict[w] += 1
            else:
                wDict[w] = 1
            fOut.write(w.decode('utf-8')+'\n')
        fOut.close()
        #print 'There are a total of ' + str(len(wDict)) + ' words in this text'

    def remove_dups(self):
        ''' Remove duplicate words '''
        f = codecs.open(wordstmpfile, "r", "utf-8")
        f2 = codecs.open(dicfile, "w", "utf-8")
        uniquelines = set(f.read().split("\n"))
        f2.write("".join([line + "\n" for line in uniquelines]))
        f2.close()

    def Create(self, dirname):
        ''' Create sorted unique word list '''
        tmpname = "/tmp/words.dic"
        alltext = "/tmp/" + "%s" %self.lng + ".dat"
        fOut = codecs.open(alltext, "w+", "utf-8")

        print "00%....Creating Text Data to Parse"
        for root, dirs, files in os.walk(dirname):
            for fname in files:
                filen = codecs.open(os.path.join(root, fname), "r", "utf-8")
                fOut.write(filen.read())
                filen.close()
        print "25%....Reading Text Data to Parse"
        self.read_file(alltext)
        print "50%....Created Text Data to Parse"
        self.extract_words()
        print "65%....Extracted words from input Text Data"
        self.remove_dups()
        print "80%....Removed duplicated words from extracted wordlist"
        cmd = "/usr/bin/wordlist2hunspell %s " %tmpname + "%s " %self.lng
        os.system(cmd)
        print "......in current directory"
        exit()

if __name__ == "__main__":
    # WordXt() runs the whole pipeline in its constructor (and exits
    # via Create()).  The original additionally called main(), which
    # is not defined anywhere and would raise NameError if reached.
    WordXt()
