Differences
This shows you the differences between two versions of the page.
Both sides previous revision Previous revision Next revision | Previous revision | ||
software_development:python_pandas [2022/08/04 14:40] prgram [I/O file] |
software_development:python_pandas [2023/05/16 15:17] (current) prgram [encoding_errors - 'ignore'] |
||
---|---|---|---|
Line 1: | Line 1: | ||
====== python pandas ====== | ====== python pandas ====== | ||
{{INLINETOC}} | {{INLINETOC}} | ||
+ | |||
+ | === etc : list === | ||
+ | <code python> | ||
+ | set( [list] ) # unique value | ||
+ | [list].sort() #자동적용? | ||
+ | [list1] + [list2] #list합치기 | ||
+ | </code> | ||
+ | |||
+ | ===== shape of df ===== | ||
+ | === Pivot_table === | ||
+ | <code python> | ||
+ | df.pivot_table(index=[인덱스컬럼], | ||
+ | columns=[컬럼1,컬럼2], | ||
+ | values=[값], | ||
+ | aggfunc='sum').reset_index() | ||
+ | </code> | ||
+ | * string일 때는 aggfunc='max' | ||
+ | * index에 NULL 있으면 안됨 | ||
+ | == fillna == | ||
+ | <code python> | ||
+ | df[['a','b','c']] = df[['a','b','c']].fillna('') | ||
+ | </code> | ||
+ | |||
+ | === group by === | ||
+ | <code python> | ||
+ | df.groupby([컬럼들]).agg({'컬럼':sum}).reset_index() | ||
+ | |||
+ | df.groupby([COLUMNS])['COLUMN'].max().reset_index() | ||
+ | |||
+ | df = df.assign(date=pd.to_numeric(df['date'], errors='coerce')).groupby(['코드', '종목명']).agg({'date':np.min}).reset_index().drop_duplicates() | ||
+ | |||
+ | df = df[['코드', 'date']].groupby(['코드']).agg({'date': [np.min, np.max]}).reset_index(level='종목코드') | ||
+ | df.columns = df.columns.droplevel() | ||
+ | </code> | ||
+ | |||
+ | === rank === | ||
+ | <code python> | ||
+ | df['rank'] = df.groupby('code')['value'].rank(ascending=False) | ||
+ | </code> | ||
+ | |||
+ | === merge === | ||
+ | <code python> | ||
+ | df_out = df_out.merge(df, on=['no', 'name'], how='outer') #left_on right_on | ||
+ | </code> | ||
+ | |||
+ | |||
+ | ===== modify ===== | ||
+ | === Series to DF === | ||
+ | <code python> | ||
+ | df = df.append(pd.DataFrame(pd.Series(dict_row)).transpose(), ignore_index=True) | ||
+ | </code> | ||
+ | |||
+ | |||
+ | === rename === | ||
+ | <code python> | ||
+ | df.rename(columns = {'컬럼':'new name'}, index={0:'new index'}) | ||
+ | df.rename({1: 2, 2: 4}, axis='index') | ||
+ | |||
+ | df.columns = [컬럼들..] | ||
+ | df.columns = ['1'] + df.columns[1:].tolist() | ||
+ | </code> | ||
+ | |||
+ | === order of columns === | ||
+ | <code python> | ||
+ | #1 | ||
+ | df = df.sort_index(axis='columns', level = 'MULTILEVEL INDEX NAME/no') | ||
+ | #2 | ||
+ | df.columns | ||
+ | col_order = ['a','b','c'] | ||
+ | df = df.reindex(col_order, axis='columns') | ||
+ | </code> | ||
+ | |||
+ | |||
+ | === map === | ||
+ | <code python> | ||
+ | df['코드'] = 'A' + df['코드'].map(lambda x: f'{x:0>6}') #6글자로 | ||
+ | </code> | ||
+ | |||
+ | |||
+ | ===== get info ===== | ||
=== Shapes === | === Shapes === | ||
Line 13: | Line 93: | ||
df.loc[df.sex=='W', 'year_born'].hist() | df.loc[df.sex=='W', 'year_born'].hist() | ||
+ | |||
+ | === type === | ||
+ | <code python> | ||
+ | df['value'] = pd.to_numeric(df['value'], errors='coerce') | ||
+ | df['value'].dtype | ||
+ | </code> | ||
+ | |||
+ | |||
+ | ===== get element ===== | ||
=== Selects === | === Selects === | ||
Line 18: | Line 107: | ||
iloc: Select by position | iloc: Select by position | ||
loc: Select by label | loc: Select by label | ||
+ | | ||
+ | df.loc[:,~df.columns.isin(['a','b'])] | ||
+ | |||
+ | df[~( df['a'].isin(['1','2','3']) & df['b']=='3' )] #row-wise | ||
+ | df.loc[~( df['a'].isin(['1','2','3']) & df['b']=='3' ), 8] #row-wise & column | ||
</code> | </code> | ||
+ | === row iteration === | ||
+ | <code python> | ||
+ | for idx,row in anime[:2].iterrows(): | ||
+ | print(idx, row) | ||
+ | </code> | ||
| | ||
| | ||
=====I/O file===== | =====I/O file===== | ||
+ | |||
+ | === encoding_errors - 'ignore'=== | ||
+ | Encoding 제대로 했는데도 안되면.. | ||
+ | 공공데이터가 이런 경우가 많음. | ||
+ | |||
+ | Error tokenizing data. C error: EOF inside string starting at row 0 | 판다스 에러 | ||
+ | https://con2joa.tistory.com/m/60 | ||
+ | quoting=csv.QUOTE_NONE 파라미터 | ||
+ | |||
+ | <code python> | ||
+ | import chardet | ||
+ | with open(file, 'rb') as rawdata: | ||
+ | result = chardet.detect(rawdata.read(100000)) | ||
+ | result | ||
+ | |||
+ | |||
+ | data = pd.read_csv( file, encoding='cp949', encoding_errors='ignore') | ||
+ | # on_bad_lines='skip' | ||
+ | # error_bad_lines=False | ||
+ | </code> | ||
+ | |||
+ | === to_numberic === | ||
+ | <code python> | ||
+ | #1 | ||
+ | df = pd.read_csv('file.csv', encoding='utf-8', index_col=0, converters={'col':int, 'col2':str}) | ||
+ | #2 | ||
+ | df['col'] = pd.to_numeric(df[col].str.replace(',',''), errors='coerce') | ||
+ | </code> | ||
=== Excel === | === Excel === | ||
Line 61: | Line 188: | ||
file open/ append lines : https://thispointer.com/how-to-append-text-or-lines-to-a-file-in-python/ | file open/ append lines : https://thispointer.com/how-to-append-text-or-lines-to-a-file-in-python/ | ||
- | === row iteration === | ||
- | <code python> | ||
- | for idx,row in anime[:2].iterrows(): | ||
- | print(idx, row) | ||
- | </code> | ||
- | === Series to DF === | ||
- | <code python> | ||
- | df = df.append(pd.DataFrame(pd.Series(dict_row)).transpose(), ignore_index=True) | ||
- | </code> | ||
- | === rename === | ||
- | <code python> | ||
- | df.rename(columns = {'컬럼':'new name'}, index={0:'new index'}) | ||
- | df.rename({1: 2, 2: 4}, axis='index') | ||
- | df.columns = [컬럼들..] | ||
- | df.columns = ['1'] + df.columns[1:].tolist() | ||
- | </code> | ||
- | === map === | ||
- | <code python> | ||
- | df['코드'] = 'A' + df['코드'].map(lambda x: f'{x:0>6}') #6글자로 | ||
- | </code> | ||
- | === Pivot_table === | ||
- | <code python> | ||
- | df.pivot_table(index=[인덱스컬럼], | ||
- | columns=[컬럼1,컬럼2], | ||
- | values=[값], | ||
- | aggfunc='sum').reset_index() | ||
- | </code> | ||
- | === group by === | ||
- | <code python> | ||
- | df.groupby([컬럼들]).agg({'컬럼':sum}).reset_index() | ||
- | |||
- | df = df.assign(date=pd.to_numeric(df['date'], errors='coerce')).groupby(['코드', '종목명']).agg({'date':np.min}).reset_index().drop_duplicates() | ||
- | |||
- | df = df[['코드', 'date']].groupby(['코드']).agg({'date': [np.min, np.max]}).reset_index(level='종목코드') | ||
- | df.columns = df.columns.droplevel() | ||
- | </code> | ||
- | |||
- | === merge === | ||
- | <code python> | ||
- | df_out = df_out.merge(df, on=['no', 'name'], how='outer') | ||
- | </code> | ||
- | |||
- | === rank === | ||
- | <code python> | ||
- | df['rank'] = df.groupby('code')['value'].rank(ascending=False) | ||
- | </code> | ||
- | === to_numberic === | ||
- | <code python> | ||
- | #1 | ||
- | df = pd.read_csv('file.csv', encoding='utf-8', index_col=0, converters={'col':int, 'col2':str}) | ||
- | #2 | ||
- | df['col'] = pd.to_numeric(df[col].str.replace(',',''), errors='coerce') | ||
- | </code> | ||
- | |||
- | |||
- | |||
- | === type === | ||
- | <code python> | ||
- | df['value'] = pd.to_numeric(df['value'], errors='coerce') | ||
- | df['value'].dtype | ||
- | </code> | ||
{{tag>software_development python 파이썬 pandas 데이터정리 data_analysis}} | {{tag>software_development python 파이썬 pandas 데이터정리 data_analysis}} |