Data analysis - numpy

Keywords: Python less Database angular

DIKW

DATA-->INFORMATION-->KNOWLEDGE-->WISDOM

Data - > Information - > Knowledge - > Wisdom

Crawler - > Database - > Data Analysis - > Machine Learning

  • Information: By organizing and processing data in some way and analyzing the relationships within it, the data becomes meaningful; this meaningful data is information.
  • Knowledge: If data is a collection of facts, conclusions about those facts can be drawn from it. Knowledge, then, is a collection of information that makes the information useful. Knowledge is the application of information: a process of judging and confirming information that combines experience, context, interpretation and reflection. Knowledge can answer "how?" questions and helps us to model and simulate.
  • Wisdom: Wisdom can be simply summed up as the ability to make correct judgments and decisions, including the best use of knowledge. Wisdom can answer the question of "why". Back in the previous example, improvements can be identified based on the business impact of the failure on the customer

Mathematics

Calculus

1 # import math
2 # s = 0
3 # for i in range(1, 1001):
4 #     x = (math.pi / 1000) * i
5 #     y = math.sin((math.pi / 1000) * i)
6 #     s = (math.pi / 1000) * y + s
7 # print(s)

 

 1 # import numpy as np
 2 # def sin_integral(l,r,p):
 3 #     sum_result = 0
 4 #     delta = (r - l) / p
 5 #     for i in range(p):
 6 #         left = i * delta
 7 #         delta_area = delta * np.sin(left)
 8 #         sum_result += delta_area
 9 #     return sum_result
10 # print(sin_integral(0.0,np.pi,100000))

numpy

 1 # coding=utf-8
 2 import numpy as np
 3 import matplotlib.pyplot as pt
 4 
 5 # x Cubic power
 6 # X = np.linspace(-100, 100, 100)
 7 # Y = X * X * X
 8 
 9 # tan
10 # X = np.linspace(-np.pi//2,np.pi//2,1000)
11 # Y = np.tan(X)
12 
13 # log
14 # X = np.linspace(-10,10,100)
15 # Y = np.log(X)
16 #
17 # pt.plot(X, Y)
18 # pt.show()
19 
20 # -----------------------------------------------------------------------
21 
22 # Cocks and rabbits in the same cage
23 # for x in range(36):
24 #     y = 35 - x
25 #     if x+2*y == 47:
26 #         print(x,y)
27 
28 # sinx Area 0-pi
29 # import math
30 # s = 0
31 # for i in range(1, 1001):
32 #     x = (math.pi / 1000) * i
33 #     y = math.sin((math.pi / 1000) * i)
34 #     s = (math.pi / 1000) * y + s
35 # print(s)
36 # Inheritance encapsulated as a function
37 # import numpy as np
38 # def sin_integral(l,r,p):
39 #     sum_result = 0
40 #     delta = (r - l) / p
41 #     for i in range(p):
42 #         left = i * delta
43 #         delta_area = delta * np.sin(left)
44 #         sum_result += delta_area
45 #     return sum_result
46 # print(sin_integral(0.0,np.pi,100000))
47 
48 # --------------------------------------------------------
# Array creation, attribute inspection and element-wise arithmetic demos.
# The commented-out print lines use Python 2 print-statement syntax.
49 # a = np.arange(18).reshape(3, 6)     #2-D array (matrix): 3 rows by 6 columns
50 a = np.arange(24).reshape(2,3,4)     #3-D array: 2 layers, each 3 rows by 4 columns
51 # print a
52 # print a.ndim    #Number of dimensions of the array
53 # print np.ndim([[1,1],[2,2]])    #Number of dimensions of a nested list
54 # print a.dtype.name      #Name of the element type, e.g. int32 (platform dependent)
55 # print a.size    #Total number of elements
56 # print a.itemsize    #Size in bytes of each array element
57 # print type(a)         #Type of a
58 
59 b = np.array([[1.2, 2, 3], [4, 5, 6]])
60 # print b.dtype       #float64, because the input contains a float
61 
62 c = np.array([[1, 1], [2, 2]], dtype=complex)
63 # print c,c.dtype     #Array of complex numbers, dtype complex128
64 
65 z = np.zeros((3, 4))
66 # print z     #All-zero array; the dtype defaults to float64
67 
68 o = np.ones((2, 3, 4), dtype=np.int16)
69 # print o
70 # Creates a three-dimensional array of ones with the dtype given at creation; picture a block filled with 1s.
71 #Two layers, each of which is a two-dimensional array of three rows and four columns
72 
73 e = np.empty((2,3))
74 # print e     #np.empty does not initialize the entries, so the printed values are arbitrary and differ between runs/machines
75 
76 # f = np.arange(1,9,2)
77 f = np.arange(0,3,0.5)
78 # print f     #[0. 0.5 1. 1.5 2. 2.5]; the third argument (2 above, 0.5 here) is the step
79 
80 # print np.arange(10000)  #If the array is too large to print in full, NumPy skips the central part and prints only the head and tail.
81 # np.set_printoptions(threshold='nan')      #Intended to disable that summarization via set_printoptions; NOTE(review): the NumPy docs say to pass sys.maxsize for this -- confirm 'nan' still works on the NumPy version in use
82 # print np.arange(10000).reshape(100,100)
83 
84 a = np.array([20,30,40,50])
85 b = np.arange(4)
86 # print a-b           #Element-wise subtraction
87 # print b**2              #Element-wise square
88 # print 10*np.sin(a)      #sin is applied to every element first, then the result is multiplied by 10

operation

Factorial

np.math.factorial(100)

Logarithm

np.log()

Square root

1. Prepare a data representation for each condition. 2. Prepare the logic. 3. Apply your data to the logic. 4. Optimize the structure.

1 # np.sqrt(3)    
2 
3 # A = (2, 7)
4 # B = (8, 3)  # Euclidean distance
5 # AB = np.sqrt((A[0] - B[0]) ** 2 + (A[1] - B[1]) ** 2)
6 # print AB

trigonometric function

  • np.arctan()
  • np.cos()
  • np.sin()
  • np.rad2deg() - converts radians to degrees
  • np.deg2rad() - converts degrees to radians
  • ……
1 # x = np.array([3, 0]) + np.array([0, 3])
2 # x = np.array([3,3])
3 # l = np.linalg.norm(x)   #Norm (Length) of Vector x
4 # h = np.arctan(3.0/3.0)  #Calculating radian pi/4
5 # j = np.rad2deg(h)       #Radius turning angle 45 degrees
6 # np.deg2rad()          #radians
7 # print j

Dot multiplication

By default, the +, -, * and / operators on NumPy arrays (vectors) work element-wise: each element is combined with the element at the same position in the other array.

1 array1.dot(array2)
2 
3 # d1 = np.array([2, 7])
4 # d2 = np.array([8, 3])
5 # print d1.dot(d2)        #Point Multiplication (Inner Product) 2*8+7*3 Result: Real Number
 1 # Cosine Similarity, Vector Inner Product, Multiplication and Addition of Corresponding Elements
 2 '''
 3 Let two vectors be a=(x1,y1), b=(x2,y2).
 4 The angle between them is α. Because a·b = |a||b|cosα,
 5 therefore cosα = a·b/(|a||b|) = (x1*x2 + y1*y2) / (sqrt(x1^2 + y1^2) * sqrt(x2^2 + y2^2))
 6 '''
 7 # d12 = d1.dot(d2)                    #d1·d2
 8 # d1_len = np.linalg.norm(d1)         #|d1|
 9 # d2_len = np.linalg.norm(d2)         #|d2|
10 # cosa = d12 / (d1_len * d2_len)      #Cosine cosa
11 # a = np.rad2deg(np.arccos(cosa))     #Angle a
12 # print a

complex

1 # a = 1 + 2j              #Complex complex
2 # b = 2 + 3j             #Taylor series, Fourier series
3 # print a,type(a),a*b,a-b
1 # np.nan   #not a number occurs when data is read missing or computing exceptions occur, essentially a floating point number
2 # np.exp(10)  #Exponents based on e
3 # np.log(10)    #Logarithms based on e, ln
4 # np.e          #e,2.71828182
5 # np.inf          #Infinity

function

Empty array

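np.empty allocates the array without initializing it, so its contents are arbitrary: they may look like zeros or like extremely large or small numbers, and must be assigned before being used.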

Real numbers can only be approximated with finite precision by floating-point numbers in a computer and cannot always be represented exactly, so we should be very careful when comparing with zero: two values A and B are considered equal when the absolute difference |A - B| is smaller than a tiny tolerance such as 0.1e-10.

np.empty((3, 3))

array

Vectors are quantities with a direction and a length, and they can be represented by NumPy's multidimensional arrays. A two-dimensional vector corresponds to a point in a plane.

 

1 np.array([[1,2,3],[4,5,6]])

norm

Norm of Vector (Length)

np.linalg.norm(np.array([3,3]))

Type conversion

1 array.astype(np.int)

Array information

1 array.shape
2 array.shape[0]
3 array.shape[1]
1 # Subtracting two vectors and taking the norm of the difference gives the distance between the two points.
2 d1 = np.array([2, 7])
3 # d2 = np.array([8,3])
4 # np.linalg.norm(d1-d2)
5 # d1.astype(np.int)    #Converts the array elements to int; NOTE(review): the np.int alias was removed in newer NumPy releases -- use the builtin int there
6 # d1.shape    #Tuple with the size of each dimension; d1 is 1-D, so this is (2,)
7 # d1.shape[0]     #Size of the first dimension (the number of rows for a 2-D array)
8 # d1.shape[1]     #Size of the second dimension (columns of a 2-D array); d1 is 1-D, so this index does not exist for d1

Evenly spaced values

# np.linspace()

 

 1 # xs = np.linspace(-1000, 1000, 10000)
 2 # idx = []
 3 # max_result = []
 4 # for x in xs:
 5 #     y = -3 * (x ** 2) + 5 * x - 6
 6 #     idx.append(x)
 7 #     max_result.append(y)
 8 # print max(max_result),idx[max_result.index(max(max_result))]
 9 
10 # def poly_test(l,r):
11 #     r_len = r - l
12 #     max_num = l
13 #     m_idx = l
14 #     for i in range(r_len):
15 #         r_num = l + i
16 #         result = (r_num ** 2) * -3 + (5 * r_num) - 6
17 #         if result > max_num:
18 #             max_num = result
19 #             m_idx = i
20 #     return max_num,m_idx
21 # print poly_test(-10000,10000)
1 # Generate 20,000 evenly spaced points on the X axis, from -1000 to 10000
2 # Vector calculation is used to directly generate all the results corresponding to the above polynomials. There is no cycle here, and 20,000 results are calculated at a time.
3 # X = np.linspace(-1000, 10000, 20000)
4 # Y = (X ** 2) * -3 + 5 * X - 6  # Vector operations, which are accelerated by computers
5 # Y.max()  # Get the maximum of the current vector
6 # Y.argmax()  # Gets the index corresponding to the maximum value of the current array (X, not X in the function)

Array slice

Two-dimensional array

# 2-D slicing demo on a 5x5 array holding 0..24 (the commented prints use Python 2 syntax).
1 n_array = np.arange(25).reshape(5, 5)
2 # print n_array      #The whole 5x5 array
3 # print n_array[:,:2]     #The first two columns
4 # print n_array[:3,:]         #The first three rows
5 # print n_array[1:4,1:4]      #Rows 1-3 and columns 1-3 (0-based indices)
6 # print n_array[2,2]          #Element in the third row, third column
7 # print n_array[2][2]         #Same element, via chained indexing
8 # print n_array[::-2]   #Every second row, in reverse order starting from the last row
9 # print n_array[::2]    #Every second row, starting from the first row

Three-dimensional array

# 3-D slicing demo: the literal below has 3 layers, each with 2 rows and 3 columns.
1 n3_array = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [3, 2, 1]], [[6, 5, 4], [9, 8, 7]]])
2 # print n3_array          #The first index selects the layer, the second the row, the third the column
3 # print n3_array[:,:,2]   #Last column of every layer
4 # print n3_array[:,:1,:].sum()    #Sum over the first row of every layer
5 # print n3_array[:1,:,:].mean()       #Mean of all elements in the first layer
6 # print n3_array[:,1,:2].std()        #Standard deviation of the first two columns of the second row of every layer

Array element selection

1 # d1 < 3                           #Returns a Boolean type matrix that satisfies the condition
2 # np.count_nonzero(d1 < 3)        #Number of elements less than 3 in the statistical array
3 # d1[d1<3]                         #Select the elements of the specified range

Case Study of Student Achievement

Data preparation

# Load the score table used by the case-study functions that follow.
1 # score_array = np.loadtxt(open('score.csv', 'rb'), delimiter=',', dtype=int)   #Alternative loader, left disabled
2 score_array = np.genfromtxt('score.csv', delimiter=',', dtype=int)   #Parse the comma-separated file score.csv into an integer array
3 students = []   #Not referenced by any code visible in this file
4 courses = ['Mathematics', 'Chinese', 'Chemistry', 'Geography', 'Music', 'Sports']   #Course names, indexed by column number; presumably in the same order as the CSV columns -- verify against score.csv

Course performance is the best

 1 def course_score():
 2     course_score_max = 0
 3     cid_max = -1
 4     for c in range(6):
 5         course_score = score_array[:, c].sum()
 6         print course_score
 7         if course_score_max < course_score:
 8             course_score_max = course_score
 9             cid_max = c
10     return courses[cid_max], course_score_max

 

Students get the best grades.

 1 def student_score():
 2     student_score_max = 0
 3     sid_max = -1
 4     for s in range(6):
 5         student_score = score_array[s, :].sum()
 6         print '{}Student No. 1:{}branch'.format(s, student_score)
 7         if student_score_max < student_score:
 8             student_score_max = student_score
 9             sid_max = s
10     return '{}No. 1 students get the best grades, with a total score{}branch'.format(sid_max, student_score_max)

Student with the most uneven scores across subjects

 1 def pian():
 2     pian_max = 0
 3     pid_max = -1
 4     for p in range(6):
 5         student_score_std = score_array[p, :].std()
 6         print '{}The variance of No.{}'.format(p, student_score_std)
 7         if pian_max < student_score_std:
 8             pian_max = student_score_std
 9             pid_max = p
10     return '{}The variance of No. 1 students is:{}'.format(pid_max, pian_max)

The best grade in the main course

 1 def main_course_score():
 2     main_course_score_max = 0
 3     cid_max = -1
 4     for c in range(3):
 5         main_course_score = score_array[:, c].sum()
 6         print main_course_score
 7         if main_course_score_max < main_course_score:
 8             main_course_score_max = main_course_score
 9             cid_max = c
10     return cid_max, main_course_score_max

Does the class do better in the main courses or in the minor courses?

 1 def than():
 2     main_course_std = 0
 3     side_course_std = 0
 4     for t in range(3):
 5         main_course_std += score_array[:, t].std()
 6     main_course_std /= 3
 7     for t in range(3, 6):
 8         side_course_std += score_array[:, t].std()
 9     side_course_std /= 3
10     if main_course_std > side_course_std:
11         return 'The class did better in the main course.'
12     else:
13         return 'The grade of the class's associate course is better.'

How many students have failed in this class?

def bad(scores=None, pass_mark=60):
    """List the students who failed at least one course.

    A student fails when their lowest score is below ``pass_mark``.

    Args:
        scores: 2-D array with one row per student and one column per
            course. Defaults to the module-level ``score_array``.
        pass_mark: Minimum passing score; scores strictly below it fail.

    Returns:
        A string containing the list of failing students' row indices.
    """
    if scores is None:
        scores = score_array

    # Check every row instead of a hard-coded range(6), so no student is
    # skipped in larger classes and smaller classes do not raise.
    badstudent = [sid for sid, row in enumerate(scores) if row.min() < pass_mark]
    return 'Failing students:{}'.format(badstudent)

Encapsulation into classes

 1 name_dic = {0: 'Mathematics', 1: 'Chinese', 2: 'Chemistry', 3: 'Geography', 4: 'Music', 5: 'Sports'}
 2 
 3 
 4 class CoursaDesc(object):
 5     def __init__(self):
 6         self.name = ''
 7         self.std = 0
 8         self.max = 0
 9         self.min = 0
10         self.mean = 0
11         self.num = 0
12 
13 
14 class ComputerDesc(object):
15     def __init__(self, n_array):
16         self.score_array = n_array
17         self.result = []
18 
19     def counter_all_coursa(self):
20         for i in range(6):
21             c_desc = CoursaDesc()
22             c_desc.name = name_dic[i]
23             c_desc.std = self.score_array[:, i].std()
24             c_desc.mean = self.score_array[:, i].mean()
25             c_desc.max = self.score_array[:, i].max()
26             c_desc.min = self.score_array[:, i].min()
27             c_desc.sum = self.score_array[:, i].sum()
28             self.result.append(c_desc)
29 
30     def best_coursa(self):
31         # std_list = [coursa.std for coursa in self.result]
32         # sum_list = [coursa.sum for coursa in self.result]
33         std_list = []
34         sum_list = []
35         for coursa in self.result:
36             std_list.append(coursa.std)
37             sum_list.append(coursa.sum)
38         std_array = np.array(std_list)
39         sum_array = np.array(sum_list)
40 
41         max_sum_coursa = sum_array.max()
42         max_sum_index = sum_array.argmax()
43 
44         min_std_coursa = std_array.min()
45         min_std_index = std_array.argmin()
46 
47         if max_sum_index == min_std_index:
48             return name_dic[max_sum_index]
49         else:
50             # The sum of the results of the course with the smallest variance
51             min_std_coursa_sum = sum_array[min_std_index]
52             # Variance of the courses with the highest total performance
53             max_sum_coursa_std = std_array[max_sum_index]
54 
55             sum_delta = max_sum_coursa - min_std_coursa_sum
56             std_delta = max_sum_coursa_std - min_std_coursa
57             sum_percent = sum_delta / max_sum_coursa
58             std_percent = std_delta / min_std_coursa
59             if sum_percent < 0.05 and std_percent > 0.2:
60                 return name_dic[min_std_index]
61 
# Script entry point: analyse the loaded score table and print the best course.
if __name__ == '__main__':
    c = ComputerDesc(score_array)
    c.counter_all_coursa()
    # Function-call form of print works on both Python 2 and Python 3.
    print(c.best_coursa())

Posted by JovanLo on Sat, 26 Jan 2019 23:24:16 -0800