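# Uploaded by Петя Гурьев
#
# Linguistic neural network (original title: "лингвистическая нейросеть")
#
# (An "advertisement" placeholder from the hosting page was here; it is not part of the program.)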
import pickle
import re
import sys

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from Stemmer import Stemmer


def text_cleaner(text):
    """Normalize a piece of text: lower-case, stem, and mask numbers.

    The text is lower-cased and split on whitespace, every token is stemmed
    with a ``Stemmer('russian')`` instance, and the stems are re-joined with
    single spaces. Afterwards each standalone run of digits is replaced by
    the placeholder `` digit `` (surrounded by spaces).

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    stemmer = Stemmer('russian')
    text = ' '.join(stemmer.stemWords(text.split()))
    # Every whole-word number collapses into the same placeholder token.
    text = re.sub(r'\b\d+\b', ' digit ', text)
    return text
def load_data():
    """Load labelled samples from ``learning.txt`` in the working directory.

    Each data line has the form ``<text>@<tag>``. Only the first two
    ``@``-separated fields are used. Lines containing ``#`` anywhere are
    treated as comments and skipped. Lines without an ``@`` separator
    (including blank lines) are skipped as well instead of raising
    ``IndexError``.

    Returns:
        A dict ``{'text': [...], 'tag': [...]}`` with two parallel lists.
        Tags have their line terminator removed so that the same tag always
        maps to the same label, whether or not the line ends with a newline.
    """
    data = {'text': [], 'tag': []}
    # NOTE(review): the platform default encoding is used, as in the original;
    # pass an explicit ``encoding=`` once the file's encoding is confirmed.
    with open('learning.txt') as source:
        for line in source:
            if '#' in line:
                continue
            row = line.rstrip('\r\n').split('@')
            if len(row) < 2:
                # Blank or malformed line: there is no tag to pair with.
                continue
            data['text'].append(row[0])
            data['tag'].append(row[1])
    return data
def train_test_split(data, validation_split=0.1):
    """Shuffle the samples and split them into training and held-out parts.

    Args:
        data: Dict with parallel lists under the keys ``'text'`` and
            ``'tag'``.
        validation_split: Fraction of the samples to hold out; the held-out
            count is ``int(validation_split * len(data['text']))``.

    Returns:
        A dict with the keys ``'train'`` and ``'test'``, each mapping to
        ``{'x': [...], 'y': [...]}``. The held-out part is also available
        under the legacy misspelled key ``'text'``.
    """
    sz = len(data['text'])
    indices = np.arange(sz)
    np.random.shuffle(indices)

    # Apply the same permutation to both lists so text/tag pairs stay aligned.
    X = [data['text'][i] for i in indices]
    Y = [data['tag'][i] for i in indices]

    nb_validation_samples = int(validation_split * sz)

    # The first ``nb_validation_samples`` shuffled items are held out and the
    # remainder is used for training. The original had these two slices
    # swapped, which trained on the small fraction only.
    held_out = {
        'x': X[:nb_validation_samples],
        'y': Y[:nb_validation_samples],
    }
    return {
        'train': {
            'x': X[nb_validation_samples:],
            'y': Y[nb_validation_samples:],
        },
        'test': held_out,
        # Backward-compatible alias for the original (misspelled) key.
        'text': held_out,
    }
def openai():
    """Train a text classifier and classify one line typed by the user.

    Loads the samples with ``load_data()``, splits them with
    ``train_test_split()``, fits a TF-IDF + ``SGDClassifier(loss='hinge')``
    pipeline on the training part, then prompts for a line of text on stdin
    and prints the predicted tag.

    Returns:
        None.
    """
    data = load_data()
    D = train_test_split(data)

    text_clf = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', SGDClassifier(loss='hinge')),
    ])
    # Fit texts against their tags; the original passed the texts as labels
    # too, so the model never learned the tags at all.
    # NOTE(review): text_cleaner() is defined in this file but never applied
    # to the samples or the query - confirm whether that is intended.
    text_clf.fit(D['train']['x'], D['train']['y'])

    z = input('Введите слова через запятую и пробел: ')
    # predict() expects a collection of documents, so wrap the single query.
    predicted = text_clf.predict([z])
    print(predicted[0])
if __name__ == '__main__':
    # Run the interactive classifier only when executed as a script.
    # openai() returns None, which sys.exit() treats as a successful exit.
    sys.exit(openai())
# "Download" - link text left over from the hosting page, not part of the program.