CRAWLER
from html.parser import HTMLParser
from urllib.request import urlopen
from urllib import parse

# Collects the absolute URLs of all anchor tags found in a page.
class LinkParser(HTMLParser):
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for (key, value) in attrs:
                if key == 'href':
                    newURL = parse.urljoin(self.baseURL, value)
                    self.links.append(newURL)

    def getLinks(self, url, num):
        self.links = []
        self.baseURL = url
        response = urlopen(url)
        x = response.read()
        if 'text/html' in response.getheader('Content-Type'):
            htmlString = x.decode('utf-8')
            self.feed(htmlString)
            # Save the raw page as <num>.txt and log the URL in "DataBase".
            fname = str(num) + '.txt'
            with open(fname, 'wb') as file:
                file.write(x)
            with open("DataBase", 'a') as f:
                f.write(url + '\n')
            print("Successful Crawl of ", num)
            return self.links
        else:
            return []

# Breadth-first spider: visit up to maxP pages starting from the seed URL.
def spider(url, maxP):
    pages = [url]
    num = 0
    while num < maxP and pages != []:
        num = num + 1
        url = pages[0]
        pages = pages[1:]
        try:
            print(num, " Visiting ", url, "\n\n")
            parser = LinkParser()
            links = parser.getLinks(url, num)
            pages = pages + links
        except Exception as e:
            print(str(e))

spider('https://www.amazon.in/', 50)
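
A small follow-up sketch, added for illustration and not part of the original listing: the crawl leaves behind numbered page dumps and a URL log, so the log can be read back to recover which URL produced a given <num>.txt file. It mirrors the l[h-1] lookup used in the similarity section below and makes the same assumption that log lines and file numbers stay in step.
# Illustration only (assumption): map a crawled file number back to its URL
# using the "DataBase" log written by getLinks() (one URL per line).
def url_for(num, logfile="DataBase"):
    with open(logfile) as log:
        urls = log.read().split()
    return urls[num - 1]

# Example, assuming at least one page was crawled successfully:
# print(url_for(1))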

COSINE SIMILARITY
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import glob

# The query sits at position 0 of the training set; every crawled page
# (<num>.txt) is appended after it.
query = "Wiki is blah blah "
fname = []
train_set = []
d = {}
train_set.append(query)
flist = glob.glob('*.txt')
for name in flist:
    fname.append(name)
    f = open(name, encoding="UTF-8")
    a = f.read()
    train_set.append(a)
    f.close()

# TF-IDF matrix over the query plus all documents.
tfidf = TfidfVectorizer()
tfidfm = tfidf.fit_transform(train_set)

# Cosine similarity of the query (row 0) against every document row.
cosineval = []
for i in range(1, len(train_set)):
    c = cosine_similarity(tfidfm[0:1], tfidfm[i])[0][0]
    cosineval.append(float(c))

# Map document index -> similarity score, then rank the scores.
for j in range(0, len(fname)):
    d[j] = cosineval[j]
cosineval.sort(reverse=True)

# FileCheckDatabase holds one URL per line, in crawl order.
file = open("FileCheckDatabase", 'r')
data = file.read()
l = data.split()

# Print the ten most similar documents with their scores and URLs.
for i in range(0, 10):
    x = list(d.keys())[list(d.values()).index(cosineval[i])]
    h = fname[x]
    h = int(h[:-4])
    print(fname[x])
    print(cosineval[i])
    print(l[h - 1])
    print("\n\n")

GRAPH
import matplotlib.pyplot as plt
import networkx as nx
import scipy as sp
import numpy as np

# Directed citation-style graph on nodes 0..9.
G = nx.DiGraph()
G.add_nodes_from([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
G.add_edges_from([(0,5),(1,5),(2,4),(3,4),(0,3),(4,6),(5,6),(5,4),(5,3),(6,7),(4,8),(3,9),(8,9),(6,9)])
print(nx.info(G))

print("\tDegree Centrality:\n")
print(nx.degree_centrality(G))
print("\n\tCloseness Centrality:\n")
print(nx.closeness_centrality(G))
print("\n\tBetweenness Centrality:\n")
print(nx.betweenness_centrality(G, k=None, normalized=False, weight=None, endpoints=False, seed=None))
print("\tDegree Prestige:\n")
print(nx.in_degree_centrality(G))

# Proximity prestige: for each node i, the average shortest-path distance
# from the nodes that can actually reach it.
print("\n\tProximity Prestige:\n")
length = nx.all_pairs_shortest_path_length(G)
d = dict(length)
final_dict = {}
for i in G.nodes():
    k = 0
    counter = 0
    for j in G.nodes():
        try:
            k = k + d[j][i]
            counter += 1
        except KeyError:
            # j has no path to i, so it does not influence i.
            continue
    if counter != 1:
        final_dict[i] = k / (counter - 1)
print(final_dict)

print("\n\tRank Prestige:\n")
print(nx.pagerank(G))

# Co-citation matrix: (A^T A)[i][j] counts the documents that cite both i and j.
A = nx.adjacency_matrix(G).todense()
AT = A.transpose()
print("Co-Citation Matrix=\n")
print(np.matmul(AT, A))
print("\n")
for i in range(10):
    cited = [j for j in range(10) if A.item(i, j) != 0]
    if len(cited) > 1:
        print(i, " co-cites nodes:")
        for cite in cited:
            print(cite)
        print("\n")

# Bibliographic coupling matrix: (A A^T)[i][j] counts the references shared by i and j.
print("Bibliographical Coupling Matrix=\n")
print(np.matmul(A, AT))
print("\n")
for i in range(10):
    citers = [j for j in range(10) if A.item(j, i) != 0]
    if len(citers) > 1:
        print(i, " is co-cited by nodes:")
        for cite in citers:
            print(cite)
        print("\n")

# Simplified copy of the NetworkX power-iteration PageRank (the personalization
# and dangling arguments are accepted but a uniform distribution is always used).
def pagerank(G, alpha=0.85, personalization=None, max_iter=100, tol=1.0e-6,
             nstart=None, weight='weight', dangling=None):
    if len(G) == 0:
        return {}
    if not G.is_directed():
        D = G.to_directed()
    else:
        D = G
    # Right-stochastic version of the graph: out-edge weights sum to 1.
    W = nx.stochastic_graph(D, weight=weight)
    N = W.number_of_nodes()
    if nstart is None:
        x = dict.fromkeys(W, 1.0 / N)
    else:
        s = float(sum(nstart.values()))
        x = dict((k, v / s) for k, v in nstart.items())
    p = dict.fromkeys(W, 1.0 / N)
    dangling_weights = p
    dangling_nodes = [n for n in W if W.out_degree(n, weight=weight) == 0.0]
    for _ in range(max_iter):
        xlast = x
        x = dict.fromkeys(xlast.keys(), 0)
        danglesum = alpha * sum(xlast[n] for n in dangling_nodes)
        for n in x:
            for nbr in W[n]:
                x[nbr] += alpha * xlast[n] * W[n][nbr][weight]
            x[n] += danglesum * dangling_weights[n] + (1.0 - alpha) * p[n]
        err = sum([abs(x[n] - xlast[n]) for n in x])
        if err < N * tol:
            return x

print(pagerank(G))

h, a = nx.hits(G)
print("\nHubs (outgoing):")
print(h)
print("\nAuthorities (incoming):")
print(a)
nx.draw(G)
plt.show()

# Iterative rank prestige: repeatedly redistribute each node's score to the
# nodes it points to (a damping-free, PageRank-style update).
input_user = 2          # number of iterations

l = []
h = {}
for node in G.nodes():
    h[node] = 1 / G.number_of_nodes()
l.append(h)

x = 1
while x <= input_user:
    h1 = {}
    for i in G.nodes():
        iter_list1 = list(G.in_edges(i))
        sum1 = 0
        for j in range(0, len(iter_list1)):
            first_one = iter_list1[j][0]
            sum1 = l[x - 1][first_one] / G.out_degree(first_one) + sum1
        h1[i] = sum1
    l.append(h1)
    x = x + 1

print('Rank Prestige')
print(l[x - 1])

#       K-Nearest Neighbors (K-NN)
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Social_Network_Ads.csv')
X = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting K-NN to the Training set
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('K-NN (Training set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

# Visualising the Test set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('K-NN (Test set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()
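
A small hedged addition, not part of the original listing: the confusion matrix cm computed above can be summarised into a single accuracy figure with scikit-learn's accuracy_score, using the same y_test and y_pred.
# Illustration only (assumption): overall accuracy from the same test predictions.
from sklearn.metrics import accuracy_score
print("Confusion matrix:\n", cm)
print("Test accuracy:", accuracy_score(y_test, y_pred))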

Naive Bayes

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Social_Network_Ads.csv')
X = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('Naive Bayes (Training set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

# Visualising the Test set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('Naive Bayes (Test set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()
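
A short hedged addition, not in the original paste: GaussianNB also exposes class-membership probabilities through predict_proba, which can be more informative than the hard labels in y_pred.
# Illustration only (assumption): probability of each class for the first
# five test points, using the fitted GaussianNB classifier above.
print(classifier.predict_proba(X_test[:5]))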

Count Word Frequency
After stripping the HTML with BeautifulSoup, the text is clean enough to tokenize. The frequency distribution of those tokens can be computed with Python NLTK: the FreqDist() function does the job.
from bs4 import BeautifulSoup
import urllib.request
import nltk

# Fetch a page, strip the markup, and tokenize on whitespace.
response = urllib.request.urlopen('http://php.net/')
html = response.read()
print(html)
soup = BeautifulSoup(html, "html5lib")
text = soup.get_text(strip=True)
print(text)
tokens = [t for t in text.split()]
print(tokens)

# Count how often each token occurs.
freq = nltk.FreqDist(tokens)
for key, val in freq.items():
    print(str(key) + ':' + str(val))
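
A hedged extension, not in the original paste: FreqDist inherits from collections.Counter, so the most frequent tokens can be pulled out directly instead of printing every entry, and freq.plot() draws the distribution if matplotlib is installed.
# Illustration only: top 10 tokens and a frequency plot of the first 20.
print(freq.most_common(10))
freq.plot(20, cumulative=False)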
STOPWORD REMOVAL
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

example_sent = "This is a sample sentence, showing off the stop words filtration."
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(example_sent)

# Keep only the tokens that are not English stop words.
filtered_sentence = []
for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)

print(word_tokens)
print(filtered_sentence)
STEMMING
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

ps = PorterStemmer()
example_words = ["python", "pythoner", "pythoning", "pythoned", "pythonly"]
for w in example_words:
    print(ps.stem(w))
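
A short illustrative extension, an assumption rather than part of the original: the same stemmer applied to a tokenized sentence instead of a fixed word list (word_tokenize is already imported above; NLTK's punkt tokenizer data must be available).
# Illustration only: stem every token of a sentence.
new_text = "It is important to be pythonly while you are pythoning with python."
print([ps.stem(w) for w in word_tokenize(new_text)])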

INDEXING
from os import system, name
import re

# Read each file, lower-case it, replace non-word characters with spaces,
# and split it into a list of terms.
def process_files(filenames):
    file_to_terms = {}
    for file in filenames:
        pattern = re.compile(r'[\W_]+')
        file_to_terms[file] = open(file, 'r').read().lower()
        file_to_terms[file] = pattern.sub(' ', file_to_terms[file])
        file_to_terms[file] = file_to_terms[file].split()
    return file_to_terms

# Positional index for a single document: word -> list of positions.
def index_one_file(termlist):
    fileIndex = {}
    for index, word in enumerate(termlist):
        if word in fileIndex.keys():
            fileIndex[word].append(index)
        else:
            fileIndex[word] = [index]
    return fileIndex

# One positional index per file: filename -> {word: [positions]}.
def make_indices(termlists):
    total = {}
    for filename in termlists.keys():
        total[filename] = index_one_file(termlists[filename])
    return total

# Inverted index: word -> {filename: [positions]}.
def fullIndex(regdex):
    total_index = {}
    for filename in regdex.keys():
        for word in regdex[filename].keys():
            if word in total_index.keys():
                if filename in total_index[word].keys():
                    total_index[word][filename].extend(regdex[filename][word][:])
                else:
                    total_index[word][filename] = regdex[filename][word]
            else:
                total_index[word] = {filename: regdex[filename][word]}
    return total_index

# Single-term query: the files in which the word occurs.
def one_word_query(word, invertedIndex):
    pattern = re.compile(r'[\W_]+')
    word = pattern.sub(' ', word)
    if word in invertedIndex.keys():
        return [filename for filename in invertedIndex[word].keys()]
    else:
        return []

# Free-text query: union of the single-term results.
def free_text_query(string, index):
    pattern = re.compile(r'[\W_]+')
    string = pattern.sub(' ', string)
    result = []
    for word in string.split():
        result += one_word_query(word, index)
    return list(set(result))

# Phrase query: the terms must occur at consecutive positions in a file.
def phrase_query(string, invertedIndex):
    pattern = re.compile(r'[\W_]+')
    string = pattern.sub(' ', string)
    listOfLists, result = [], []
    for word in string.split():
        listOfLists.append(free_text_query(word, invertedIndex))
    setted = set(listOfLists[0]).intersection(*listOfLists)
    for filename in setted:
        temp = []
        for word in string.split():
            temp.append(invertedIndex[word][filename][:])
        # Shift each position list so that a consecutive phrase lines up.
        for i in range(len(temp)):
            for ind in range(len(temp[i])):
                temp[i][ind] -= i
        if set(temp[0]).intersection(*temp):
            result.append(filename)
        print('\n temp : \n')
        print(temp)
    return result

filenames = ['d:/document1.txt', 'd:/document2.txt']
termslist = process_files(filenames)
print('\nterm list \n')
print(termslist)
print('\n\n')
totaldict = make_indices(termslist)
print('total dictionary \n')
print(totaldict)
print('\n\n')
index = fullIndex(totaldict)
print('full index \n')
print(index)
print('\n\n')
#one_word_query('exceptions', index)
#query_word=free_text_query('exceptions',index)
#print(query_word)
system('cls')
print('\n\n')
print('\n\n')
#r=phrase_query('python has exceptions handling',index)
#print (r)
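
A self-contained usage sketch, added for illustration: the sample files and their contents below are made up, while the function names are the ones defined above. It assumes the script reached this point (the hard-coded d:/ paths above must exist, as in the original), or it can be run on its own after the function definitions.
# Hypothetical demo: create two tiny documents, build the inverted index,
# and run the three query types against it.
with open('document1.txt', 'w') as f1:
    f1.write('python has exceptions handling built in')
with open('document2.txt', 'w') as f2:
    f2.write('java also supports handling of exceptions')

demo_terms = process_files(['document1.txt', 'document2.txt'])
demo_index = fullIndex(make_indices(demo_terms))
print(one_word_query('exceptions', demo_index))
print(free_text_query('python exceptions', demo_index))
print(phrase_query('exceptions handling', demo_index))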

#       Support Vector Machine (SVM)

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Social_Network_Ads.csv')
X = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting SVM to the Training set
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('SVM (Training set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

# Visualising the Test set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('SVM (Test set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()
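
A hedged bridge to the next section, which does kernel SVM in R: the Python equivalent is just a different kernel argument to the SVC class already imported above. The rbf_classifier name is introduced here for illustration and is not part of the original listing.
# Illustration only (assumption): radial-basis kernel SVM on the same split.
rbf_classifier = SVC(kernel = 'rbf', random_state = 0)
rbf_classifier.fit(X_train, y_train)
print(confusion_matrix(y_test, rbf_classifier.predict(X_test)))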

#       Kernel SVM
#       (This section is R, not Python: it uses caTools, e1071 and base R plotting.)

# Importing the dataset
dataset = read.csv('Social_Network_Ads.csv')
dataset = dataset[3:5]

# Encoding the target feature as factor
dataset$Purchased = factor(dataset$Purchased, levels = c(0, 1))

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(123)
split = sample.split(dataset$Purchased, SplitRatio = 0.75)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

# Feature Scaling
training_set[-3] = scale(training_set[-3])
test_set[-3] = scale(test_set[-3])

# Fitting Kernel SVM to the Training set
# install.packages('e1071')
library(e1071)
classifier = svm(formula = Purchased ~ .,
                 data = training_set, type = 'C-classification', kernel = 'radial')

# Predicting the Test set results
y_pred = predict(classifier, newdata = test_set[-3])

# Making the Confusion Matrix
cm = table(test_set[, 3], y_pred)

# Visualising the Training set results
library(ElemStatLearn)
set = training_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('Age', 'EstimatedSalary')
y_grid = predict(classifier, newdata = grid_set)
plot(set[, -3], main = 'Kernel SVM (Training set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

# Visualising the Test set results
library(ElemStatLearn)
set = test_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('Age', 'EstimatedSalary')
y_grid = predict(classifier, newdata = grid_set)
plot(set[, -3], main = 'Kernel SVM (Test set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

#       Decision Tree Classification

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Social_Network_Ads.csv')
X = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting Decision Tree Classification to the Training set
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('Decision Tree Classification (Training set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

# Visualising the Test set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('Decision Tree Classification (Test set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()
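
A hedged addition, not in the original listing: the fitted tree itself can be dumped for inspection with scikit-learn's export_graphviz. The output file name tree.dot and the feature labels are choices made here for illustration (they match the Age and Estimated Salary columns used above).
# Illustration only (assumption): write the learned tree structure to a DOT file.
from sklearn.tree import export_graphviz
export_graphviz(classifier, out_file = 'tree.dot',
                feature_names = ['Age', 'EstimatedSalary'])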

#       Random Forest Classification

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Social_Network_Ads.csv')
X = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting Random Forest Classification to the Training set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('Random Forest Classification (Training set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

# Visualising the Test set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('Random Forest Classification (Test set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()
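
A final hedged addition, not in the original listing: the fitted forest exposes feature_importances_, which shows how much each of the two inputs (Age and Estimated Salary, as labelled in the plots above) contributes to its splits.
# Illustration only (assumption): per-feature importance of the fitted forest.
print(dict(zip(['Age', 'EstimatedSalary'], classifier.feature_importances_)))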


