Skr Eric's data analysis class

Keywords: encoding Programming

Common functions of Numpy

1. Read matrix file

xxx,xxx,xxx,xxx

xxx,xxx,xxx,xxx

xxx,xxx,xxx,xxx

It consists of data items in several rows and columns. The number of items in each row must be equal, the type of data items in each column must be the same, and there is a clear separator between data items.

np.loadtxt(

file path,

delimiter = delimiter string,

usecols = tuple of the column indices to read,

unpack = whether to split the result by column (default False),

dtype = target element type (default float),

converters = dictionary mapping a column index to its converter function) ->

a single two-dimensional array (unpack=False) or one one-dimensional array per column (unpack=True)

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import datetime as dt
import numpy as np
import matplotlib.pyplot as mp
import matplotlib.dates as md


# Converter function to change day in day month year format to year month day format
def dmy2ymd(dmy):
    """Convert a day-month-year date field to a year-month-day string.

    Written as a converter for ``np.loadtxt``: the returned text is in the
    ISO form that a ``datetime64[D]`` column can parse.

    Args:
        dmy: The date as ``DD-MM-YYYY``, either as ``str`` or as a UTF-8
            encoded bytes-like object.

    Returns:
        The same date formatted as ``YYYY-MM-DD``.

    Raises:
        ValueError: If the text does not match the ``%d-%m-%Y`` format.
    """
    # loadtxt passes bytes or str to converters depending on the NumPy
    # version and its ``encoding`` argument. str(x, encoding=...) raises
    # TypeError for a str argument, so only decode non-str input.
    if not isinstance(dmy, str):
        dmy = str(dmy, encoding='utf-8')
    # Parse through datetime (rather than splitting on '-') so that
    # malformed or impossible dates are rejected.
    date = dt.datetime.strptime(dmy, '%d-%m-%Y').date()
    return date.strftime('%Y-%m-%d')


# Read Apple's stock data from the CSV file: the date (column 1) and the
# opening, highest, lowest and closing prices (columns 3 to 6).
# The dmy2ymd converter rewrites the date field so that it can be stored
# in the datetime64[D] ('M8[D]') field of the structured dtype.
dates, opening_prices, highest_prices, \
    lowest_prices, closing_prices = np.loadtxt(
        '../../data/aapl.csv', delimiter=",",
        usecols=(1, 3, 4, 5, 6), unpack=True,
        dtype='M8[D], f8, f8, f8, f8',
        converters={1: dmy2ymd})
# Create the figure window and label it
mp.figure('Candlestick', facecolor='lightgray')
mp.title('Candlestick', fontsize=20)
mp.xlabel('Date', fontsize=14)
mp.ylabel('Price', fontsize=14)
ax = mp.gca()
# Major ticks: Monday of each week
ax.xaxis.set_major_locator(
    md.WeekdayLocator(byweekday=md.MO))
# Minor ticks: every day
ax.xaxis.set_minor_locator(md.DayLocator())
# Major tick label format: day, abbreviated English month name, year
ax.xaxis.set_major_formatter(
    md.DateFormatter('%d %b %Y'))
mp.tick_params(labelsize=10)
# Dotted grid lines attached to the y axis only
mp.grid(axis='y', linestyle=':')
# Convert the numpy.datetime64[D] values to datetime.datetime objects
# (reached here through matplotlib.dates) for plotting
dates = dates.astype(md.datetime.datetime)
# Boolean masks: days closing at least 0.01 above / below the opening price
rise = closing_prices - opening_prices >= 0.01
fall = opening_prices - closing_prices >= 0.01
# One RGB triple per day for the face and edge colours; days matched by
# neither mask keep the initial (0, 0, 0), i.e. black
fc = np.zeros(dates.size, dtype='3f4')
ec = np.zeros(dates.size, dtype='3f4')
# Rising days: white face with red edge; falling days: green face and edge
fc[rise], fc[fall] = (1, 1, 1), (0, 0.5, 0)
ec[rise], ec[fall] = (1, 0, 0), (0, 0.5, 0)
# Zero-width bars from the lowest to the highest price (the thin "wick")
mp.bar(dates, highest_prices - lowest_prices,
       0, lowest_prices, color=fc, edgecolor=ec)
# 0.8-wide bars from the opening to the closing price (the candle body)
mp.bar(dates, closing_prices - opening_prices,
       0.8, opening_prices, color=fc, edgecolor=ec)
# Let Matplotlib auto-format the date labels on the x axis
mp.gcf().autofmt_xdate()
mp.show()

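Note: `rise` and `fall` in the code above are mask arrays, i.e. boolean arrays used as an index to select the elements at the positions where the mask is True.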

 

2. Arithmetic mean

Sample: S = [S1, S2,..., Sn]

Arithmetic mean: m = (s1+s2+...+sn)/n

Let s be the true value and di the random error of the i-th sample:

s1 = s + d1

s2 = s + d2

...

sn = s + dn

m = s + (d1+d2+...+dn)/n

As n -> ∞: (d1+d2+...+dn)/n -> 0

When the number of samples is large enough, the arithmetic mean is an unbiased estimate of the true value.

np.mean(sample array) -> arithmetic mean

sample_array.mean() -> arithmetic mean

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
# Load the closing price column (column 6) of the CSV file
closing_prices = np.loadtxt(
    '../../data/aapl.csv', delimiter=',', usecols=6, unpack=True)

# 1) By hand: accumulate the prices one by one, then divide by the count
total = 0
for price in closing_prices:
    total += price
mean = total / closing_prices.size
print(mean)

# 2) With the module-level function
mean = np.mean(closing_prices)
print(mean)

# 3) With the array method
mean = closing_prices.mean()
print(mean)

 

3. Weighted average

Sample: S = [s1, s2,..., sn]

Weight: W = [w1, w2,..., wn]

Weighted average:

a = (s1w1+s2w2+...+snwn)/(w1+w2+...+wn)

The arithmetic mean is the weighted average in which all weights are equal.

np.average(sample array, weights=weight array) -> weighted average

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
# Load the closing prices (column 6) and the volumes (column 7)
closing_prices, volumes = np.loadtxt(
    '../../data/aapl.csv', delimiter=',', usecols=(6, 7), unpack=True)

# Volume-weighted average price by hand:
# sum(price * volume) / sum(volume)
weighted_total = 0
wsum = 0
for price, volume in zip(closing_prices, volumes):
    weighted_total += price * volume
    wsum += volume
vwap = weighted_total / wsum
print(vwap)

# The same value from np.average with the volumes as weights
vwap = np.average(closing_prices, weights=volumes)
print(vwap)

Time: early -------- > late

Price: 10... 52 48 51 50

Weight: low -------- > High

                                           ?

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import datetime as dt
import numpy as np


def dmy2days(dmy):
    """Convert a day-month-year date field to a day count.

    Written as a converter for ``np.loadtxt``; the resulting number is
    used as the time weight of the corresponding row.

    Args:
        dmy: The date as ``DD-MM-YYYY``, either as ``str`` or as a UTF-8
            encoded bytes-like object.

    Returns:
        The number of days between ``datetime.date.min`` and the date.

    Raises:
        ValueError: If the text does not match the ``%d-%m-%Y`` format.
    """
    # loadtxt passes bytes or str to converters depending on the NumPy
    # version and its ``encoding`` argument. str(x, encoding=...) raises
    # TypeError for a str argument, so only decode non-str input.
    if not isinstance(dmy, str):
        dmy = str(dmy, encoding='utf-8')
    date = dt.datetime.strptime(dmy, '%d-%m-%Y').date()
    return (date - dt.date.min).days


# Load the dates (column 1, converted to day counts by dmy2days) and the
# closing prices (column 6)
days, closing_prices = np.loadtxt(
    '../../data/aapl.csv', delimiter=',', usecols=(1, 6), unpack=True,
    converters={1: dmy2days})

# Time-weighted average price by hand: later days carry a larger day
# count and therefore a larger weight
weighted_total = 0
wsum = 0
for price, day in zip(closing_prices, days):
    weighted_total += price * day
    wsum += day
twap = weighted_total / wsum
print(twap)

# The same value from np.average with the day counts as weights
twap = np.average(closing_prices, weights=days)
print(twap)

Time - > value

 

4. Extreme values (maximum and minimum)

np.max() / np.min() - find the maximum / minimum element of an array

np.argmax() / np.argmin() - find the subscript of the maximum / minimum element of an array

np.maximum() / np.minimum() - collect the larger / smaller of the elements at corresponding positions of two arrays into a new array

np.ptp() - the range (peak to peak) of an array: the difference between the maximum element and the minimum element

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
# Draw 9 uniformly distributed random integers from [10, 100) as a 3x3 matrix
a = np.random.randint(10, 100, size=(3, 3))
print(a)

# Largest and smallest element of the whole matrix
b = a.max()
c = a.min()
print(b, c)

# Subscripts of those elements in the flattened matrix
d = a.argmax()
e = a.argmin()
print(d, e)

# Typical use of argmax: look up the name that belongs to the best score
names = np.array(['zhangfei', 'zhaoyun', 'guanyu'])
scores = np.array([70, 90, 80])
print(names[scores.argmax()])

# A second random matrix for the element-wise comparisons
f = np.random.randint(10, 100, size=(3, 3))
print(f)

# Element-wise larger / smaller values of the two matrices
g = np.maximum(a, f)
h = np.minimum(a, f)
print(g, h, sep='\n')

# Range of the first matrix: maximum minus minimum
i = np.ptp(a)
print(i)

Price fluctuation range

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
# Load the highest (column 4) and lowest (column 5) prices
highest_prices, lowest_prices = np.loadtxt(
    '../../data/aapl.csv', delimiter=',', usecols=(4, 5), unpack=True)

# By hand: start from the first row, then scan the remaining rows for a
# higher high and a lower low
max_highest_price = highest_prices[0]
min_lowest_price = lowest_prices[0]
for high, low in zip(highest_prices[1:], lowest_prices[1:]):
    max_highest_price = max(max_highest_price, high)
    min_lowest_price = min(min_lowest_price, low)
print(max_highest_price - min_lowest_price)

# The same fluctuation range from the NumPy reductions
print(highest_prices.max() - lowest_prices.min())

Price fluctuation range

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
# Load the highest (column 4) and lowest (column 5) prices
highest_prices, lowest_prices = np.loadtxt(
    '../../data/aapl.csv', delimiter=',', usecols=(4, 5), unpack=True)

# By hand: track the maximum and minimum of each column separately,
# starting from the first row
max_highest_price = min_highest_price = highest_prices[0]
max_lowest_price = min_lowest_price = lowest_prices[0]
for high, low in zip(highest_prices[1:], lowest_prices[1:]):
    max_highest_price = max(max_highest_price, high)
    min_highest_price = min(min_highest_price, high)
    max_lowest_price = max(max_lowest_price, low)
    min_lowest_price = min(min_lowest_price, low)
print(max_highest_price - min_highest_price,
      max_lowest_price - min_lowest_price)

# The same two ranges from np.ptp (peak to peak)
high_range = np.ptp(highest_prices)
low_range = np.ptp(lowest_prices)
print(high_range, low_range)

 

5. median

5000 3000 4000 6000 1 10000000000

1 3000 4000 5000 6000 10000000000

                 \____/

                     |

                 4500

(a[(6-1)/2] + a[6/2]) / 2 = (a[2] + a[3]) / 2 = 4500 (the subscripts use integer division)

1 3000 4000 5000 10000000000

                 |

             4000

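(a[(5-1)/2] + a[5/2]) / 2 = (a[2] + a[2]) / 2 = 4000

General formula: (a[(L-1)/2] + a[L/2]) / 2, where a is sorted in ascending order, L is its length and the subscripts use integer division

np.median(array) -> median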

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
# Load the closing price column (column 6) of the CSV file
closing_prices = np.loadtxt(
    '../../data/aapl.csv', delimiter=',',
    usecols=(6), unpack=True)
# Sort in ascending order. np.msort was deprecated in NumPy 1.24 and
# removed in NumPy 2.0; np.sort gives the same result for a 1-D array.
sorted_prices = np.sort(closing_prices)
size = sorted_prices.size
# By hand: average the two middle elements. For an odd size both
# subscripts are the same element; floor division keeps them integers.
median = (sorted_prices[(size - 1) // 2] +
          sorted_prices[size // 2]) / 2
print(median)
# The same value from np.median (which does not need sorted input)
median = np.median(closing_prices)
print(median)

 

6. standard deviation

Sample: S = [s1, s2,..., sn]

Mean: m = (s1+s2+...+sn)/n -> estimate of the true value

Deviation: D = [d1, d2,..., dn], di = si - m

Squared deviation: Q = [q1, q2,..., qn], qi = di^2

(population) variance: v = (q1+q2+...+qn)/n

(population) standard deviation: std = sqrt(v) -> the root mean square of the deviations. It indicates how far the samples deviate from the true value and is used as a measure of the dispersion of a set of random quantities.

(sample) variance: v' = (q1+q2+...+qn)/(n-1)

(sample) standard deviation: std' = sqrt(v')

np.std(sample array, ddof=delta degrees of freedom (default 0)) -> standard deviation; the divisor is n - ddof, so ddof=1 gives the sample standard deviation

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
# Load the closing price column (column 6) of the CSV file
closing_prices = np.loadtxt(
    '../../data/aapl.csv', delimiter=',', usecols=6, unpack=True)

# By hand: deviations from the mean and the sum of their squares
mean = closing_prices.mean()
devs = closing_prices - mean
squared_sum = (devs ** 2).sum()
# Population variance divides by n, sample variance by n - 1
pvar = squared_sum / devs.size
svar = squared_sum / (devs.size - 1)
# The standard deviations are the square roots of the variances
pstd = np.sqrt(pvar)
sstd = np.sqrt(svar)
print(pstd, sstd)

# The same values from NumPy: ddof=0 (default) for the population,
# ddof=1 for the sample standard deviation
pstd = closing_prices.std()
sstd = closing_prices.std(ddof=1)
print(pstd, sstd)

 

7. Weekly data

Mon Tue Wed Thu Fri

 xxx  xxx  xxx   xxx  xxx

 xxx  xxx  xxx   xxx  xxx

...

np.where(condition) -> the array of subscripts of the elements that meet the condition

np.take(array, subscript array) -> the sub-array made of the elements of the array at the given subscripts

array[mask array] -> the sub-array made of the elements of the array at the positions where the mask array is True

Calculate weekly average

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import datetime as dt
import numpy as np


def dmy2wday(dmy):
    """Convert a day-month-year date field to its day of the week.

    Written as a converter for ``np.loadtxt``.

    Args:
        dmy: The date as ``DD-MM-YYYY``, either as ``str`` or as a UTF-8
            encoded bytes-like object.

    Returns:
        The weekday number, 0 (Monday) to 6 (Sunday).

    Raises:
        ValueError: If the text does not match the ``%d-%m-%Y`` format.
    """
    # loadtxt passes bytes or str to converters depending on the NumPy
    # version and its ``encoding`` argument. str(x, encoding=...) raises
    # TypeError for a str argument, so only decode non-str input.
    if not isinstance(dmy, str):
        dmy = str(dmy, encoding='utf-8')
    date = dt.datetime.strptime(dmy, '%d-%m-%Y').date()
    return date.weekday()


# Load the dates (column 1, converted to weekday numbers by dmy2wday)
# and the closing prices (column 6)
wdays, closing_prices = np.loadtxt(
    '../../data/aapl.csv', delimiter=',', usecols=(1, 6), unpack=True,
    converters={1: dmy2wday})

# Average closing price per weekday, Monday (0) to Friday (4). The mask
# wdays == wday selects the rows of one weekday; np.take with np.where,
# or indexing with the np.where result, would select the same rows.
ave_closing_prices = np.array([
    closing_prices[wdays == wday].mean() for wday in range(5)])

# Print each weekday label with its average rounded to two decimals
labels = ['MON', 'TUE', 'WED', 'THU', 'FRI']
for label, average in zip(labels, ave_closing_prices):
    print(label, np.round(average, 2))

np.apply_along_axis(processing function, axis, array)

The array is cut into one-dimensional slices along the given axis; each slice is passed as the argument to the processing function, and the return values are recombined into an array that is returned.

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np


def foo(a):
    """Return the sum of the elements of ``a``, using its ``sum`` method."""
    total = a.sum()
    return total


# 3x3 matrix holding the numbers 1 to 9
a = np.arange(1, 10).reshape(3, -1)
print(a)

# Along axis 0 foo receives each column, along axis 1 each row
b = np.apply_along_axis(foo, axis=0, arr=a)
print(b)
c = np.apply_along_axis(foo, axis=1, arr=a)
print(c)

Statistics of weekly opening and closing prices

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import datetime as dt
import numpy as np


def dmy2wday(dmy):
    """Convert a day-month-year date field to its day of the week.

    Written as a converter for ``np.loadtxt``.

    Args:
        dmy: The date as ``DD-MM-YYYY``, either as ``str`` or as a UTF-8
            encoded bytes-like object.

    Returns:
        The weekday number, 0 (Monday) to 6 (Sunday).

    Raises:
        ValueError: If the text does not match the ``%d-%m-%Y`` format.
    """
    # loadtxt passes bytes or str to converters depending on the NumPy
    # version and its ``encoding`` argument. str(x, encoding=...) raises
    # TypeError for a str argument, so only decode non-str input.
    if not isinstance(dmy, str):
        dmy = str(dmy, encoding='utf-8')
    date = dt.datetime.strptime(dmy, '%d-%m-%Y').date()
    return date.weekday()


# Load the dates (column 1, converted to weekday numbers by dmy2wday)
# and the opening, highest, lowest and closing prices (columns 3 to 6)
columns = np.loadtxt(
    '../../data/aapl.csv', delimiter=',', usecols=(1, 3, 4, 5, 6),
    unpack=True, converters={1: dmy2wday})
# Keep only the first 16 rows of every column
wdays, opening_prices, highest_prices, lowest_prices, closing_prices = (
    column[:16] for column in columns)

# Row subscripts of the first Monday (0) and the last Friday (4)
first_monday = (wdays == 0).nonzero()[0][0]
last_friday = (wdays == 4).nonzero()[0][-1]
# Split the rows from that Monday to that Friday into 3 equal groups,
# one row of subscripts per week (np.split raises if the rows cannot be
# divided into 3 equal parts)
indices = np.array(
    np.split(np.arange(first_monday, last_friday + 1), 3))


def week_summary(indices):
    """Summarise one week of prices.

    Reads the module-level arrays ``opening_prices``, ``highest_prices``,
    ``lowest_prices`` and ``closing_prices``.

    Args:
        indices: Row subscripts of the days of one week, in order.

    Returns:
        A tuple ``(opening, highest, lowest, closing)``: the opening price
        at the first subscript, the maximum of the highest prices, the
        minimum of the lowest prices and the closing price at the last
        subscript.
    """
    first_day, last_day = indices[0], indices[-1]
    week_high = highest_prices.take(indices).max()
    week_low = lowest_prices.take(indices).min()
    return (opening_prices[first_day], week_high,
            week_low, closing_prices[last_day])


# Call week_summary once per row (one week) of the subscript matrix
summaries = np.apply_along_axis(week_summary, axis=1, arr=indices)
# Write one comma-separated line per week, using the compact %g format
np.savetxt('../../data/summary.csv', summaries,
           fmt='%g', delimiter=',')

 

8. One dimensional convolution

a = [1 2 3 4 5]

b = [6 7 8]

c = a * b = [6 19 40 61 82 67 40] - full convolution (full); here * denotes convolution, not element-wise or matrix multiplication

[19 40 61 82 67] - same-dimension convolution (same)

[40 61 82] - valid convolution (valid)

            6   19  40  61  82 67 40

0    0    1    2    3    4    5   0    0

8    7    6

      8    7    6

            8    7    6

                  8    7    6

                        8    7    6

                              8    7    6

                                    8    7    6

c = np.convolve(a, b, mode)

a: the array being convolved

b: the convolution kernel array

mode: the convolution type, 'full' (default), 'same' or 'valid'

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
# Array to convolve and convolution kernel
a = np.array([1, 2, 3, 4, 5])
b = np.array([6, 7, 8])
# Print the full (NumPy's default mode), same and valid convolutions
for mode in ('full', 'same', 'valid'):
    print(np.convolve(a, b, mode))

 

9. Moving average

a b c d e f g h i j k l m n

^^^ ^^

[1/5 1/5 1/5 1/5 1/5]

A B C D E -> S=A+B+C+D+E

(aA + bB + cC + dD +eE)/S

aA/S + bB/S +cC/S + dD/S + eE/S

[A/S B/S C/S D/S E/S]

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import datetime as dt
import numpy as np
import matplotlib.pyplot as mp
import matplotlib.dates as md


# Converter function to change day in day month year format to year month day format
def dmy2ymd(dmy):
    """Convert a day-month-year date field to a year-month-day string.

    Written as a converter for ``np.loadtxt``: the returned text is in the
    ISO form that a ``datetime64[D]`` column can parse.

    Args:
        dmy: The date as ``DD-MM-YYYY``, either as ``str`` or as a UTF-8
            encoded bytes-like object.

    Returns:
        The same date formatted as ``YYYY-MM-DD``.

    Raises:
        ValueError: If the text does not match the ``%d-%m-%Y`` format.
    """
    # loadtxt passes bytes or str to converters depending on the NumPy
    # version and its ``encoding`` argument. str(x, encoding=...) raises
    # TypeError for a str argument, so only decode non-str input.
    if not isinstance(dmy, str):
        dmy = str(dmy, encoding='utf-8')
    # Parse through datetime (rather than splitting on '-') so that
    # malformed or impossible dates are rejected.
    date = dt.datetime.strptime(dmy, '%d-%m-%Y').date()
    return date.strftime('%Y-%m-%d')


# Read Apple's stock data from the CSV file: the date (column 1) and the
# closing price (column 6). The dmy2ymd converter rewrites the date field
# so that it can be stored in the datetime64[D] ('M8[D]') field.
dates, closing_prices = np.loadtxt(
    '../../data/aapl.csv', delimiter=",",
    usecols=(1, 6), unpack=True,
    dtype='M8[D], f8', converters={1: dmy2ymd})
# 5-day simple moving average by hand: the mean of every window of 5
# consecutive prices (there are size - 4 such windows)
sma51 = np.zeros(closing_prices.size - 4)
for i in range(sma51.size):
    sma51[i] = closing_prices[i:i + 5].mean()
# The same 5-day average as a 'valid' convolution with five weights of 1/5
sma52 = np.convolve(closing_prices,
                    np.ones(5) / 5, 'valid')
# 10-day simple moving average, computed the same way
sma10 = np.convolve(closing_prices,
                    np.ones(10) / 10, 'valid')
# Exponentially increasing weights from exp(-1) to exp(0), normalised so
# that they sum to 1
weights = np.exp(np.linspace(-1, 0, 5))
weights /= weights.sum()
# 5-day exponential moving average. Convolution flips the kernel, so the
# weights are reversed here to give the newest price in each window the
# largest weight.
ema5 = np.convolve(closing_prices,
                   weights[::-1], 'valid')
# Create the figure window and label it
mp.figure('Moving Average', facecolor='lightgray')
mp.title('Moving Average', fontsize=20)
mp.xlabel('Date', fontsize=14)
mp.ylabel('Price', fontsize=14)
ax = mp.gca()
# Major ticks: Monday of each week
ax.xaxis.set_major_locator(
    md.WeekdayLocator(byweekday=md.MO))
# Minor ticks: every day
ax.xaxis.set_minor_locator(md.DayLocator())
# Major tick label format: day, abbreviated English month name, year
ax.xaxis.set_major_formatter(
    md.DateFormatter('%d %b %Y'))
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
# Convert the numpy.datetime64[D] values to datetime.datetime objects
# (reached here through matplotlib.dates) for plotting
dates = dates.astype(md.datetime.datetime)
mp.plot(dates, closing_prices, c='lightgray',
        label='Closing Price')
# Each average is plotted against the last day of its window, hence the
# date offsets of 4 (5-day windows) and 9 (10-day windows)
mp.plot(dates[4:], sma51, c='orangered',
        label='SMA-51')
# Drawn wide and translucent so that it stays visible behind SMA-51
mp.plot(dates[4:], sma52, c='orangered', alpha=0.3,
        linewidth=6, label='SMA-52')
mp.plot(dates[9:], sma10, c='dodgerblue',
        label='SMA-10')
mp.plot(dates[4:], ema5, c='limegreen',
        label='EMA-5')
mp.legend()
# Let Matplotlib auto-format the date labels on the x axis
mp.gcf().autofmt_xdate()
mp.show()

 

 

 

 

For more courses, please follow SkrEric's programming class on wechat

95 original articles published, praised 152, visited 10000+
Private letter follow

Posted by chwebdesigns on Wed, 15 Jan 2020 03:25:19 -0800