본문 바로가기
Data Engineering 재밌따/Python basic for data

파이썬 기본 라이브러리 삼대장 ; Numpy Pandas Matplotlib

by 임리둥절 2024. 12. 3.
반응형
더보기

기본적인 python 개념을 알고 있다는 전제 하,
라이브러리들을 간단하게 사용하는 것들 위주로 기억하기 위해 작성하는 글임을 참고 부탁드립니다


Pandas : 데이터 프레임 읽기

이놈이 데이터 프레임이다

 

pandas๋กœ ํŒŒ์ผ ์–ด๋–ป๊ฒŒ ์ฝ๋‚˜์š”? 

๊ฐœ์ธ data/track_XY.txt ๋ฅผ ์‚ฌ์šฉ. ( ๊นƒํ—ˆ๋ธŒ์— ์žˆ๊ธดํ•œ๋ฐ ์ถ”ํ›„ ๊ณต๊ฐœ. ๋‹น์žฅ์€ ๋น„์Šทํ•˜๊ฒŒ ์ƒ๊ธด๋†ˆ ์ฐพ์œผ๋ฉด ๋˜๊ฒ ์๋‹ˆ๋‹ค )

๋งˆ์ง€๋ง‰์— ์š”์•ฝ์œผ๋กœ ํ”„๋ฆฐํŠธํ•ด์ค˜์„œ ๊ฒฐ๊ณผ๊นŒ์ง€ ํ™•์ธํ•œ๋‹ค.

, 
ํ‘œ์ค€
\t tab
\s+ ํ•˜๋‚˜ ์ด์ƒ์˜ ๊ณต๋ฐฑ
'' txt ํŒŒ์ผ์—์„œ ์ฃผ๋กœ ์‚ฌ์šฉ
r'\s+' column์ด space๋กœ ๊ตฌ๋ถ„๋˜์–ด ์žˆ๋Š” ๊ฒฝ์šฐ

 

import pandas as pd

# Read the track file into a two-column DataFrame.
track_data = "data/track_XY.txt"
df = pd.read_table(track_data,
                   sep=r"\s+",                                     # separator (raw string: "\s" is not a valid str escape)
                   # ',' standard | '\t' tab | '\s+' one or more whitespace | ' ' common in txt files
                   encoding="euc-kr",                              # file encoding
                   names=['longitude', 'latitude'],                # column names
                   dtype={'longitude': float, 'latitude': float})  # column data types

# Mean, max and min of every column.
summary = df.agg(['mean', 'max', 'min'])

# Print the result.
print(summary)

그냥 이렇게 숫자 두개 써있는 데이터 프레임 아무거나 가져와도 된다.

pandas๋กœ ํŒŒ์ผ ์ €์žฅ ์–ด๋–ป๊ฒŒ?

๊ธฐ์กด euc-kr ์€ 8๋น„ํŠธ ๋ฌธ์ž ์ธ์ฝ”๋”ฉ์ด๊ณ  ๋Œ€ํ‘œ์ ์ธ ํ•œ๊ธ€ ์™„์„ฑํ˜• ์ธ์ฝ”๋”ฉ์ด๋ผ ํ•˜์ง€๋งŒ ์ตœ๊ทผ์€ ๋˜ ๋ฐ”๋€Œ์–ด utf-8๋กœ ๋ณ€๊ฒฝ.

utf-8-sig ์ธ์ฝ”๋”ฉ์„ ์‚ฌ์šฉํ•˜์—ฌ BOM(Byte Order Mark)์„ ํฌํ•จํ•  ์ˆ˜๋„ ์žˆ์Œ. (๋ญ ํฌ๊ฒŒ ์ƒ๊ด€์€ ์—†๋‹ค)

  • ์—ฌ๊ธฐ์„œ index = ํ–‰์ˆ˜ (0,1,2) column = header True / False๋กœ ํ‘œ์‹œ ํ•ฉ๋‹ˆ๋‹ค.
  • na_rep : ๊ฒฐ์ธก์น˜๊ฐ€ ์žˆ์„๊ฒฝ์šฐ ‘NaN’์œผ๋กœ ํ‘œ์‹œ ํ•œ๋‹ค๋Š” ๋œป
def write_pandas():
    """Write a small sample table to ``data/save1.csv`` as UTF-8 text.

    The table has the columns ``name``, ``age``, ``weight`` and ``job`` and
    three rows. The file is space-separated, has a header row and no index
    column; floats are written with two decimals and missing values as
    ``NaN``.

    Returns:
        None. The only effect is the file written by ``DataFrame.to_csv``.
    """
    # A plain dict keeps insertion order, so the column order is the same as
    # with an OrderedDict and no extra import is needed.
    friends = {
        'name': ['John', 'Peter', 'pi'],
        'age': [25, 30, 40],
        'weight': [75.3, 64.5, 3.141592653],
        'job': ['student', '연구원', '교수'],
    }

    df = pd.DataFrame(friends)
    df.to_csv("data/save1.csv",
              encoding="utf-8",      # write as UTF-8 instead of EUC-KR
              sep=' ',
              na_rep='NaN',          # text used for missing values
              float_format='%.2f',   # 2 decimal places
              index=False,           # drop the row index (0, 1, 2)
              header=True)

Numpy : ๊ณ„์‚ฐ ์ž˜ํ•˜๋Š” ๋†ˆ

vector์—ฐ์‚ฐ.

array ๋‹จ์œ„๋กœ ๋ฐ์ดํ„ฐ๋ฅผ ๊ด€๋ฆฌํ•˜๊ณ  matrix ๊ณ„์‚ฐ.

  • Powerful N-dimensional array
  • Numerical Computing Tools
  • Performance

๊ฐ„๋‹จํ•˜๊ฒŒ perplexityํ•œํ…Œ ๋ถ€ํƒํ•ด์„œ ๋งŽ์ด ์“ฐ๋Š” ์ฝ”๋“œ๋ฅผ ๋ฐ›์•˜๋‹ค. 

์ฝ”๋“œ์— ์žˆ๋Š” ๊ฒƒ ์ฒ˜๋Ÿผ ๋ฐฐ์—ด์—ฐ์‚ฐ, ๋‹ค์ฐจ์› ๋ฐฐ์—ด, ์ˆ˜ํ•™ ํ•จ์ˆ˜ ์œ„์ฃผ๋กœ ์‚ฌ์šฉํ•œ๋‹ค. 

import numpy as np

# --- Array construction ---------------------------------------------------
arr1 = np.array([1, 2, 3, 4, 5])
arr2 = np.zeros((3, 3))
arr3 = np.ones((2, 2))
arr4 = np.arange(0, 10, 2)

# --- Element-wise arithmetic and reductions ------------------------------
result = arr1 + 5
squared = arr1 ** 2
sum_arr = arr1.sum()
mean_arr = arr1.mean()

# --- Reshape into a single column ----------------------------------------
reshaped = arr1.reshape(-1, 1)

# --- Uniform random numbers in [0, 1) ------------------------------------
random_arr = np.random.rand(3, 3)

# --- Slicing (elements at index 1, 2 and 3) ------------------------------
slice_arr = arr1[1:4]

# Print each result next to its label, in a fixed order.
for label, value in (
    ("Original array:", arr1),
    ("Array + 5:", result),
    ("Squared array:", squared),
    ("Sum of array:", sum_arr),
    ("Mean of array:", mean_arr),
    ("Reshaped array:\n", reshaped),
    ("Random array:\n", random_arr),
    ("Sliced array:", slice_arr),
):
    print(label, value)

Matplotlib : ์‹œ๊ฐํ™” ํˆด 

  • ์‹œ๊ฐํ™” ํˆด.
  • basemap๋„ maplotlib ์ค‘ ํ•˜๋‚˜
def draw_graph():
    """Draw y = 3x for x = 1..9 as three differently styled layers and show it.

    The layers are a '+' scatter of the raw points, a dotted blue line
    shifted down by 2, and a dashed black line shifted down by 5. The two
    lines use their own marker shapes and marker colours.
    """
    xs = np.arange(1, 10)
    base = xs * 3

    # Marker-only layer; ``s`` is the marker size.
    plt.scatter(xs, base, color='blue', marker='+', s=100)

    # (downward offset, line/marker styling) for each line layer, in draw order.
    line_layers = (
        (2, dict(linestyle=":", color='blue', marker='o',
                 markeredgecolor='g', markerfacecolor='r')),
        (5, dict(linestyle="--", color='black', marker='s',
                 markeredgecolor='m', markerfacecolor='c')),
    )
    for drop, look in line_layers:
        plt.plot(xs, base - drop, **look)

    plt.show()

def color_graph():
    """Print every colormap name Matplotlib reports, then show a sample image.

    The image is a 10x10 grid of uniform random values drawn with the
    'viridis' colormap, with a colorbar and a title.
    """
    # One colormap name per line.
    for cmap_name in plt.colormaps():
        print(cmap_name)

    # Example data to colour.
    grid = np.random.rand(10, 10)

    plt.imshow(grid, cmap='viridis')
    plt.colorbar()
    plt.title('Example with Viridis Colormap')
    plt.show()

 

산점도그래프, 컬러그래프. 및 matplotlib 지원 색상표 가져오기

기타 설정 방법

  • 가로 격자선 plt.axhline(y=2000, color='r', linewidth=1) # y축 2000 에 붉은색라인
  • 세로 격자선 plt.axvline(x=datetime(2016, 2, 17), color='r', linestyle='--', linewidth=3)
  • 실선 '-', 'solid'
  • 파선 '--', 'dashed'
  • 1점 쇄선 '-.', 'dashdot'
  • 점선 ':', 'dotted'
  • 안그림 '', ' '

plt.subplot(row, column, index) (세로길이, 가로길이, 인덱스)

https://kongdols-room.tistory.com/98

def sub_graph():
    """Show two stacked line plots that share one x-axis.

    The top panel plots a cosine multiplied by a decaying exponential over
    [0, 5]. The bottom panel plots a plain cosine over [0, 2].
    """
    t_top = np.linspace(0.0, 5.0)
    t_bottom = np.linspace(0.0, 2.0)

    damped = np.cos(2 * np.pi * t_top) * np.exp(-t_top)
    undamped = np.cos(2 * np.pi * t_bottom)

    # Top panel (row 1 of 2).
    top_axes = plt.subplot(2, 1, 1)
    plt.plot(t_top, damped, 'o-')
    plt.title('1st Graph')
    plt.ylabel('Damped oscillation')

    # The author left `plt.xticks(visible = False)` disabled at this point
    # (noted as hiding the x-ticks and labels of the top panel).

    # Bottom panel (row 2 of 2), sharing the x-axis of the top panel.
    plt.subplot(2, 1, 2, sharex=top_axes)
    plt.plot(t_bottom, undamped, '.-')
    plt.title('2nd Graph')
    plt.xlabel('time (s)')
    plt.ylabel('undamped')

    plt.tight_layout()  # adjust spacing so the panels do not overlap
    plt.show()

plt.subplot2grid((전체크기), (행, 열), (행열방향으로 크기))

def grid_graph():
    """Show five empty axes laid out on a 3x3 grid with ``subplot2grid``.

    Layout, given as (row, column) of each panel's top-left cell:
      * (0, 0) spans all three columns,
      * (1, 0) spans two columns,
      * (1, 2) spans two rows,
      * (2, 0) and (2, 1) are single cells.
    """
    plt.figure(figsize=(10, 12))

    grid_shape = (3, 3)
    # (top-left cell, span keyword arguments) in creation order.
    panels = (
        ((0, 0), {'colspan': 3}),
        ((1, 0), {'colspan': 2}),
        ((1, 2), {'rowspan': 2}),
        ((2, 0), {}),
        ((2, 1), {}),
    )
    for origin, span in panels:
        plt.subplot2grid(grid_shape, origin, **span)

    plt.show()

너.. 너무 대충그렸나? 코드상에서 위치와 비교하며 파악하면 위치, 행열알기가 편하다.

범례 legend 설정

# Template: variable_x / variable_y / variable_z stand for artists created
# earlier (for example the objects returned by plt.plot).
plt.legend(handles=[variable_x, variable_y, variable_z],  # artists to list in the legend
           loc='best',             # placement, e.g. 'upper left', 'upper right', 'lower left', 'center left'
           frameon=True,           # draw a frame around the legend
           fontsize=10,
           facecolor='lightgrey',  # legend background colour
           labelcolor='black')     # legend text colour (keyword is `labelcolor`)

+ Histogram 

Histogram 개념과 응용

  • count 16.000000
    mean 3.593750
    std 1.280869
    min 1.000000
    25% 2.875000
    50% 4.000000
    75% 5.000000
    max 5.000000
    Name: values, dtype: float64
def histogram():
    """Plot a 5-bin histogram of 16 sample values and print their summary.

    The summary printed to stdout is ``Series.describe()`` of the same
    values; the figure is shown afterwards.
    """
    frame = pd.DataFrame(
        {'values': [1, 2, 2, 2.5, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5]}
    )

    # Yellow, 70 % opaque bars with white edges, split into 5 bins.
    plt.hist(frame['values'], bins=5, color='y', edgecolor='white', alpha=0.7)
    plt.title('Histogram Example')
    plt.xlabel('Values')
    plt.ylabel('Counts')

    print(frame['values'].describe())
    plt.show()

histogram 구간에 대한 이해

counts, bins, patches
각 구간 데이터 개수, 구간 경계값 정의, 히스토그램 막대에 대한 객체 리스트
bins = np.arange(최솟값, 최댓값 + 간격, 간격)

def histogram2():
    """Plot a histogram of 100 normal samples with fixed-width bins.

    Samples are drawn with ``np.random.normal(7, 3, 100)``. Bin edges start
    at the smallest sample and advance in steps of ``bin_interval`` until
    the largest sample is covered. The range and count of every bin are
    printed to stdout, one line per bin, before the figure is shown.
    """
    data = np.random.normal(7, 3, 100)

    bin_interval = 2
    # Adding one interval to the stop value ensures the final edge is not
    # below the maximum. Explicit edges such as [0, 3, 6, 9, 12, 15] could
    # be passed as `bins` instead.
    edges = np.arange(data.min(), data.max() + bin_interval, bin_interval)

    # plt.hist returns per-bin counts, the bin edges and the bar artists;
    # the artists are not needed here.
    counts, bins, _ = plt.hist(data, bins=edges, edgecolor='black')
    plt.xlabel("bins", fontsize=12)
    plt.ylabel("count", fontsize=12)

    # Pair each lower edge with its upper edge; separate the two edges with
    # " ~ " so they no longer run together, and print counts as whole numbers.
    for lower, upper, count in zip(bins[:-1], bins[1:], counts):
        print(f"구간 {lower:.2f} ~ {upper:.2f}: {count:.0f}개")

    plt.tight_layout()
    plt.show()

+ numpy ceil, floor 소수점 올림과 내림 / min, max 이용

def mathwithhist():
    """Plot a histogram of 100 normal samples with whole-number bin edges.

    Samples are drawn with ``np.random.normal(7, 3, 100)``. The smallest
    sample is rounded down and the largest rounded up, and bin edges run
    between those two values in steps of ``bin_interval``. The range and
    count of every bin are printed to stdout, one line per bin, and a grid
    is drawn before the figure is shown.
    """
    data = np.random.normal(7, 3, 100)

    # Flooring the minimum / ceiling the maximum gives the same bounds as
    # flooring / ceiling every sample first, without the temporary arrays.
    min_val = np.floor(data.min())
    max_val = np.ceil(data.max())

    bin_interval = 2
    # Adding one interval to the stop value ensures the final edge is not
    # below max_val. Explicit edges such as [0, 3, 6, 9, 12, 15] could be
    # passed as `bins` instead.
    edges = np.arange(min_val, max_val + bin_interval, bin_interval)

    # plt.hist returns per-bin counts, the bin edges and the bar artists;
    # the artists are not needed here.
    counts, bins, _ = plt.hist(data, bins=edges, edgecolor='black')
    plt.xlabel("bins", fontsize=12)
    plt.ylabel("count", fontsize=12)
    plt.grid(True)  # add grid lines

    # Pair each lower edge with its upper edge; separate the two edges with
    # " ~ " so they no longer run together, and print counts as whole numbers.
    for lower, upper, count in zip(bins[:-1], bins[1:], counts):
        print(f"구간 {lower:.2f} ~ {upper:.2f}: {count:.0f}개")

    plt.tight_layout()
    plt.show()

๋žœ๋คํ•˜๋ฉด์„œ ๊ทธ ๊ตฌ๊ฐ„์ด ๋ฐ”๋กœ ์žกํ˜€ ๊ฐ„๊ฒฉ์ด ์•Œ์•„์„œ ๋‚˜์˜ค๋Š” ๊ฒŒ point ~^^


์ถ”๊ฐ€ ์ฐธ๊ณ  ๋งํฌ 

๋”๋ณด๊ธฐ

subplot ์ธ์ž, ์˜ต์…˜๋“ค ์ƒ์„ธ ์‚ฌํ•ญ https://kongdols-room.tistory.com/98

 

๋ฐ˜์‘ํ˜•

์ตœ๊ทผ๋Œ“๊ธ€

์ตœ๊ทผ๊ธ€

skin by ยฉ 2024 ttutta