기본적인 python 개념
은 알고 있다는 전제 하,
라이브러리들을 간단하게 사용하는 것들 위주로 기억하기 위해 작성하는 글임을 참고 부탁드립니다
Pandas : ๋ฐ์ดํฐ ํ๋ ์ ์ฝ๊ธฐ
pandas๋ก ํ์ผ ์ด๋ป๊ฒ ์ฝ๋์?
๊ฐ์ธ data/track_XY.txt ๋ฅผ ์ฌ์ฉ. ( ๊นํ๋ธ์ ์๊ธดํ๋ฐ ์ถํ ๊ณต๊ฐ. ๋น์ฅ์ ๋น์ทํ๊ฒ ์๊ธด๋ ์ฐพ์ผ๋ฉด ๋๊ฒ ์๋๋ค )
마지막에 요약으로 프린트해줘서 결과까지 확인한다.
, |
ํ์ค |
\t | tab |
\s+ | ํ๋ ์ด์์ ๊ณต๋ฐฑ |
'' | txt ํ์ผ์์ ์ฃผ๋ก ์ฌ์ฉ |
r'\s+' | column์ด space๋ก ๊ตฌ๋ถ๋์ด ์๋ ๊ฒฝ์ฐ |
import pandas as pd
# ํ์ผ ์ฝ๊ธฐ
# Load the whitespace-delimited track file into a two-column DataFrame and
# print the per-column mean, max and min.
track_data = "data/track_XY.txt"
df = pd.read_table(
    track_data,
    # Separator. A raw string is required: "\s" in a normal string literal is
    # an invalid escape sequence and triggers a warning on recent Pythons.
    # Typical choices: "," (standard) | "\t" (tab) | r"\s+" (one or more
    # whitespace characters) | " " (commonly used for .txt files).
    sep=r"\s+",
    encoding="euc-kr",  # encoding used to decode the input file
    names=["longitude", "latitude"],  # column names to assign
    dtype={"longitude": float, "latitude": float},  # per-column data type
)

# Overall mean, maximum and minimum of every column.
summary = df.agg(["mean", "max", "min"])

# Show the result.
print(summary)
pandas๋ก ํ์ผ ์ ์ฅ ์ด๋ป๊ฒ?
๊ธฐ์กด euc-kr ์ 8๋นํธ ๋ฌธ์ ์ธ์ฝ๋ฉ์ด๊ณ ๋ํ์ ์ธ ํ๊ธ ์์ฑํ ์ธ์ฝ๋ฉ์ด๋ผ ํ์ง๋ง ์ต๊ทผ์ ๋ ๋ฐ๋์ด utf-8๋ก ๋ณ๊ฒฝ.
utf-8-sig 인코딩을 사용하여 BOM(Byte Order Mark)을 포함할 수도 있음. (뭐 크게 상관은 없다)
- ์ฌ๊ธฐ์ index = ํ์ (0,1,2) column = header True / False๋ก ํ์ ํฉ๋๋ค.
- na_rep : 결측치가 있을경우 'NaN'으로 표시 한다는 뜻
def write_pandas():
    """Build a small example DataFrame and save it to ``data/save1.csv``.

    The file is written as UTF-8 text with a single space as the field
    separator, a header row and no index column.  Missing values are
    rendered as ``NaN`` and floats are formatted with two decimal places.
    """
    # Imported here so the function does not rely on a module-level import
    # that the surrounding file does not provide.
    from collections import OrderedDict

    friend_ordered_dict = OrderedDict([
        ('name', ['John', 'Peter', 'pi']),
        ('age', [25, 30, 40]),
        ('weight', [75.3, 64.5, 3.141592653]),
        # NOTE(review): the non-ASCII job titles look mis-encoded; restore
        # them from the original source before relying on this sample data.
        ('job', ['student', '์ฐ๊ตฌ์', '๊ต์']),
    ])
    df = pd.DataFrame.from_dict(friend_ordered_dict)
    df.to_csv(
        "data/save1.csv",
        encoding="utf-8",
        sep=' ',
        na_rep='NaN',           # text written for missing values
        float_format='%.2f',    # 2 decimal places
        index=False,            # omit the 0, 1, 2, ... row index
        header=True,            # keep the column-name row
    )
# ์ธ์ฝ๋ฉ์ UTF-8๋ก ๋ณ๊ฒฝํ๊ณ index (0,1,2) ์ ๊ฑฐ
Numpy : ๊ณ์ฐ ์ํ๋ ๋
vector์ฐ์ฐ.
array ๋จ์๋ก ๋ฐ์ดํฐ๋ฅผ ๊ด๋ฆฌํ๊ณ matrix ๊ณ์ฐ.
- Powerful N-dimensional array
- Numerical Computing Tools
- Performance
๊ฐ๋จํ๊ฒ perplexityํํ ๋ถํํด์ ๋ง์ด ์ฐ๋ ์ฝ๋๋ฅผ ๋ฐ์๋ค.
์ฝ๋์ ์๋ ๊ฒ ์ฒ๋ผ ๋ฐฐ์ด์ฐ์ฐ, ๋ค์ฐจ์ ๋ฐฐ์ด, ์ํ ํจ์ ์์ฃผ๋ก ์ฌ์ฉํ๋ค.
import numpy as np
# ๋ฐฐ์ด ์์ฑ
# Demo script: build a few arrays, apply basic operations and print the results.
arr1 = np.arange(1, 6)                          # 1-D array holding 1..5
arr2 = np.zeros(shape=(3, 3))                   # 3x3 array of zeros
arr3 = np.ones(shape=(2, 2))                    # 2x2 array of ones
arr4 = np.arange(start=0, stop=10, step=2)      # even numbers 0..8

# Element-wise arithmetic and reductions on arr1.
result = 5 + arr1
squared = arr1 * arr1
sum_arr = arr1.sum()
mean_arr = arr1.mean()

# Turn the 1-D array into a single column (5 rows, 1 column).
reshaped = arr1.reshape(-1, 1)

# 3x3 array of uniform random numbers in [0, 1).
random_arr = np.random.rand(3, 3)

# Elements at positions 1, 2 and 3.
slice_arr = arr1[slice(1, 4)]

print("Original array:", arr1)
print("Array + 5:", result)
print("Squared array:", squared)
print("Sum of array:", sum_arr)
print("Mean of array:", mean_arr)
print("Reshaped array:\n", reshaped)
print("Random array:\n", random_arr)
print("Sliced array:", slice_arr)
Matplotlib : ์๊ฐํ ํด
- ์๊ฐํ ํด.
- basemap๋ maplotlib ์ค ํ๋
def draw_graph():
    """Plot y = 3x for x = 1..9 as a scatter plus two offset marker lines.

    The three series are drawn on the current figure, which is then shown.
    """
    xs = np.arange(1, 10)
    ys = xs * 3

    # Scatter of the raw points; ``s`` controls the marker size.
    plt.scatter(xs, ys, s=100, marker='+', color='blue')

    # Two line plots of the same data shifted down by 2 and by 5, each with
    # its own line style and marker colours.
    shifted_series = (
        (2, dict(linestyle=":", color='blue', marker='o',
                 markeredgecolor='g', markerfacecolor='r')),
        (5, dict(linestyle="--", color='black', marker='s',
                 markeredgecolor='m', markerfacecolor='c')),
    )
    for offset, style in shifted_series:
        plt.plot(xs, ys - offset, **style)

    plt.show()
def color_graph():
    """Print every colormap name Matplotlib reports, then show a 'viridis' demo image."""
    # plt.colormaps() returns the list of colormap names; print one per line.
    for name in plt.colormaps():
        print(name)

    # Random 10x10 sample rendered with the 'viridis' colormap and a colour bar.
    noise = np.random.rand(10, 10)
    plt.imshow(noise, cmap='viridis')
    plt.colorbar()
    plt.title('Example with Viridis Colormap')
    plt.show()
기타 설정 방법
- 가로 격자선 plt.axhline(y=2000, color='r', linewidth=1) # y축 2000 에 붉은색라인
- 세로 격자선 plt.axvline(x=datetime(2016, 2, 17), color='r', linestyle='--', linewidth=3)
- 실선 '-', 'solid'
- 파선 '--', 'dashed'
- 1점쇄선 '-.', 'dashdot'
- 점선 ':', 'dotted'
- 안그림 '', ' '
plt.subplot(row, column, index) (세로길이, 가로길이, 인덱스)
https://kongdols-room.tistory.com/98
def sub_graph():
    """Draw two vertically stacked line plots that share an x-axis, then show them."""
    t_damped = np.linspace(0.0, 5.0)
    t_plain = np.linspace(0.0, 2.0)
    damped = np.exp(-t_damped) * np.cos(2 * np.pi * t_damped)
    plain = np.cos(2 * np.pi * t_plain)

    # Upper panel: damped oscillation.
    upper = plt.subplot(2, 1, 1)
    plt.plot(t_damped, damped, 'o-')
    plt.title('1st Graph')
    plt.ylabel('Damped oscillation')
    # plt.xticks(visible = False)  # Hide x-ticks and labels

    # Lower panel: undamped oscillation, sharing the x-axis of the upper panel.
    plt.subplot(2, 1, 2, sharex=upper)
    plt.plot(t_plain, plain, '.-')
    plt.title('2nd Graph')
    plt.xlabel('time (s)')
    plt.ylabel('undamped')

    plt.tight_layout()  # adjust spacing so titles and labels do not overlap
    plt.show()
plt.subplot2grid((전체크기), (행, 열), (행열방향으로 크기))
def grid_graph():
    """Lay out five empty axes on a 3x3 grid with row/column spans and show the figure."""
    plt.figure(figsize=(10, 12))

    grid_shape = (3, 3)
    # (top-left cell, span keywords) for each axes, in creation order.
    placements = (
        ((0, 0), {'colspan': 3}),   # full-width top row
        ((1, 0), {'colspan': 2}),   # middle row, first two columns
        ((1, 2), {'rowspan': 2}),   # right column, rows 1-2
        ((2, 0), {}),               # bottom-left cell
        ((2, 1), {}),               # bottom-middle cell
    )
    for cell, span in placements:
        plt.subplot2grid(grid_shape, cell, **span)

    plt.show()
범례 legend 설정
# Example legend call; variable_x / variable_y / variable_z stand for the
# plotted objects that should appear in the legend.
plt.legend(
    handles=[variable_x, variable_y, variable_z],  # list of objects to show
    loc='best',            # placement, e.g. 'upper left', 'upper right', 'lower left', 'center left'
    frameon=True,          # draw the border frame around the legend
    fontsize=10,
    facecolor='lightgrey',  # legend background colour
    labelcolor='black',     # colour of the legend text
)
+ Histogram
Histogram 개념과 응용
- count 16.000000
mean 3.593750
std 1.280869
min 1.000000
25% 2.875000
50% 4.000000
75% 5.000000
max 5.000000
Name: values, dtype: float64
def histogram():
    """Plot a 5-bin histogram of a small fixed sample and print its summary statistics."""
    frame = pd.DataFrame(
        {'values': [1, 2, 2, 2.5, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5]}
    )
    values = frame['values']

    # Fill colour, bar edge colour, transparency and number of bins.
    plt.hist(values, bins=5, color='y', edgecolor='white', alpha=0.7)
    plt.title('Histogram Example')
    plt.xlabel('Values')
    plt.ylabel('Counts')

    # Summary statistics are printed before the figure window is shown.
    print(values.describe())
    plt.show()
histogram 구간에 대한 이해
counts, bins, patches
각 구간 데이터 개수, 구간 경계값 정의, 히스토그램 막대에 대한 객체 리스트
최소값, 최대값 +간격 , 간격씩
def histogram2():
    """Draw a fixed-width-bin histogram of random data and print per-bin counts.

    100 samples are drawn with ``np.random.normal(7, 3, 100)``.  Bin edges
    start at the sample minimum and advance in steps of ``bin_interval``
    until the sample maximum is covered.
    """
    data = np.random.normal(7, 3, 100)

    bin_interval = 2
    # Explicit edges such as bins=[0, 3, 6, 9, 12, 15] could be passed instead.
    edges = np.arange(data.min(), data.max() + bin_interval, bin_interval)

    # plt.hist returns the count per bin, the bin edges and the bar objects.
    counts, bins, _ = plt.hist(data, bins=edges, edgecolor='black')
    plt.xlabel("bins", fontsize=12)
    plt.ylabel("count", fontsize=12)

    # Pair each bin's left/right edge with its count; " ~ " keeps the two
    # edge values from running together in the printed label.
    # NOTE(review): the non-ASCII label text looks mis-encoded; restore it
    # from the original source.
    for left, right, count in zip(bins[:-1], bins[1:], counts):
        print(f"๊ตฌ๊ฐ {left:.2f} ~ {right:.2f}: {count}๊ฐ")

    plt.tight_layout()
    plt.show()
+ numpy ceil, floor 소숫점 올림과 내림 / min, max 이용
def mathwithhist():
    """Draw a histogram whose bin edges fall on whole numbers and print per-bin counts.

    100 samples are drawn with ``np.random.normal(7, 3, 100)``.  The lower
    edge is the floor of the sample minimum and the upper bound is the ceil
    of the sample maximum; bins are ``bin_interval`` wide.
    """
    data = np.random.normal(7, 3, 100)

    # Round the extrema outward to whole numbers.  Flooring/ceiling only the
    # min/max gives the same bounds as rounding every data point first.
    min_val = np.floor(data.min())
    max_val = np.ceil(data.max())

    bin_interval = 2
    # Explicit edges such as bins=[0, 3, 6, 9, 12, 15] could be passed instead.
    edges = np.arange(min_val, max_val + bin_interval, bin_interval)

    # plt.hist returns the count per bin, the bin edges and the bar objects.
    counts, bins, _ = plt.hist(data, bins=edges, edgecolor='black')
    plt.xlabel("bins", fontsize=12)
    plt.ylabel("count", fontsize=12)
    plt.grid(True)  # add grid lines

    # Pair each bin's left/right edge with its count; " ~ " keeps the two
    # edge values from running together in the printed label.
    # NOTE(review): the non-ASCII label text looks mis-encoded; restore it
    # from the original source.
    for left, right, count in zip(bins[:-1], bins[1:], counts):
        print(f"๊ตฌ๊ฐ {left:.2f} ~ {right:.2f}: {count}๊ฐ")

    plt.tight_layout()
    plt.show()
추가 참고 링크
subplot 인자, 옵션들 상세 사항 https://kongdols-room.tistory.com/98
'Data Engineering ์ฌ๋ฐ๋ฐ > Python basic for data' ์นดํ ๊ณ ๋ฆฌ์ ๋ค๋ฅธ ๊ธ
๊ฐ๋จํ ํ์ด์ฌ - ์๊ฒฝ๋ ๋ณํํ๊ธฐ (1) | 2024.12.03 |
---|