Differences

This shows you the differences between two versions of the page.

--- software_development:python_pandas [2022/08/04 14:39]
prgram
+++ software_development:python_pandas [2023/05/16 15:09]
prgram [encoding_errors - 'ignore']
@@ Line 2: / Line 2: @@
 {{INLINETOC}}
-=== Shapes ===
+=== etc : list ===
 <code python>
-df.shape
+set( [list] ) # unique value
-df.describe()
+[list].sort() #자동적용?
-df.info()
+[list1] + [list2]  #list합치기
 </code>
-#use the dataframe.nunique() function to find the unique values
+===== shape of df =====
-unique_counts = pd.DataFrame.from_records([(col, df[col].nunique()) for col in df.columns],columns=['Column_Name', 'Num_Unique']).sort_values(by=['Num_Unique'])
+=== Pivot_table ===
-df.loc[df.sex=='W', 'year_born'].hist()
-=== Selects ===
 <code python>
-iloc: Select by position
+df.pivot_table(index=[인덱스컬럼],
-loc: Select by label
+               columns=[컬럼1,컬럼2],
+               values=[값],
+               aggfunc='sum').reset_index()
+</code>
+  * string일 때는 aggfunc='max'
+  * index에 NULL 있으면 안됨
+== fillna ==
+<code python>
+df[['a','b','c']] = df[['a','b','c']].fillna('')
 </code>
+=== group by ===
+<code python>
+df.groupby([컬럼들]).agg({'컬럼':sum}).reset_index()
+df.groupby([COLUMNS])['COLUMN'].max().reset_index()
-==I/O file==
+df = df.assign(date=pd.to_numeric(df['date'], errors='coerce')).groupby(['코드', '종목명']).agg({'date':np.min}).reset_index().drop_duplicates()
-=== chunk ===
+df = df[['코드', 'date']].groupby(['코드']).agg({'date': [np.min, np.max]}).reset_index(level='종목코드')
-<code python>
+df.columns = df.columns.droplevel()
-for index, chunk in enumerate( pd.read_csv(filename, chunksize=chunk_size, dtype=str), start=1):
-  chunk.memory_usage()
 </code>
+=== rank ===
 <code python>
-df_chunk = pd.read_csv(r'../input/data.csv', chunksize=1000000) # chink : # of lines
+df['rank'] = df.groupby('code')['value'].rank(ascending=False)
-for chunk in df_chunk:
-    # perform data filtering
-    chunk_filter = chunk_preprocessing(chunk)
 </code>
+=== merge ===
 <code python>
-df3=pd.read_csv("abc.txt", skiprows=1, header=None,
+df_out = df_out.merge(df, on=['no', 'name'], how='outer')	#left_on right_on
-               sep="|", nrows=100, encoding="utf-16") #, error_bad_lines=False)
-with open("abc.txt", encoding="utf-16") as f:
-    i=0
-    for line in f:
-        i=i+1
-        print(line)
-        if i >3 : break
-</code>
-file open/ append lines : https://thispointer.com/how-to-append-text-or-lines-to-a-file-in-python/
-=== row iteration ===
-<code python>
-for idx,row in anime[:2].iterrows():
-    print(idx, row)
 </code>
+===== modify =====
 === Series to DF ===
 <code python>
 df = df.append(pd.DataFrame(pd.Series(dict_row)).transpose(), ignore_index=True)
 </code>
 === rename ===
 <code python>
@@ Line 72: / Line 62: @@
 df.columns = ['1'] + df.columns[1:].tolist()
 </code>
+=== order of columns ===
+<code python>
+#1
+df = df.sort_index(axis='columns', level = 'MULTILEVEL INDEX NAME/no')
+#2
+df.columns
+col_order = ['a','b','c']
+df = df.reindex(col_order, axis='columns')
+</code>
 === map ===
@@ Line 77: / Line 78: @@
 df['코드'] = 'A' + df['코드'].map(lambda x: f'{x:0>6}')  #6글자로
 </code>
-=== Pivot_table ===
+===== get info =====
+=== Shapes ===
 <code python>
-df.pivot_table(index=[인덱스컬럼],
+df.shape
-               columns=[컬럼1,컬럼2],
+df.describe()
-               values=[값],
+df.info()
-               aggfunc='sum').reset_index()
 </code>
-=== group by ===
+#use the dataframe.nunique() function to find the unique values
+unique_counts = pd.DataFrame.from_records([(col, df[col].nunique()) for col in df.columns],columns=['Column_Name', 'Num_Unique']).sort_values(by=['Num_Unique'])
+df.loc[df.sex=='W', 'year_born'].hist()
+=== type ===
 <code python>
-df.groupby([컬럼들]).agg({'컬럼':sum}).reset_index()
+df['value'] = pd.to_numeric(df['value'], errors='coerce')
+df['value'].dtype
+</code>
-df = df.assign(date=pd.to_numeric(df['date'], errors='coerce')).groupby(['코드', '종목명']).agg({'date':np.min}).reset_index().drop_duplicates()
-df = df[['코드', 'date']].groupby(['코드']).agg({'date': [np.min, np.max]}).reset_index(level='종목코드')
+===== get element =====
-df.columns = df.columns.droplevel()
+=== Selects ===
+<code python>
+iloc: Select by position
+loc: Select by label
+df.loc[:,~df.columns.isin(['a','b'])]
+df[~( df['a'].isin(['1','2','3']) & df['b']=='3' )]		#row-wise
+df.loc[~( df['a'].isin(['1','2','3']) & df['b']=='3' ), 8]	#row-wise & column
 </code>
-=== merge ===
+=== row iteration ===
 <code python>
-df_out = df_out.merge(df, on=['no', 'name'], how='outer')
+for idx,row in anime[:2].iterrows():
+    print(idx, row)
 </code>
-=== rank ===
+=====I/O file=====
+=== encoding_errors - 'ignore'===
+Encoding 제대로 했는데도 안되면..
+공공데이터가 이런 경우가 많음.
 <code python>
-df['rank'] = df.groupby('code')['value'].rank(ascending=False)
+import chardet
+with open(file, 'rb') as rawdata:
+    result = chardet.detect(rawdata.read(100000))
+result
+data = pd.read_csv( file, encoding='cp949', encoding_errors='ignore')
+# on_bad_lines='skip'
+# error_bad_lines=False
 </code>
 === to_numberic ===
 <code python>
@@ Line 120: / Line 156: @@
 </code>
+=== chunk ===
+<code python>
+for index, chunk in enumerate( pd.read_csv(filename, chunksize=chunk_size, dtype=str), start=1):
+  chunk.memory_usage()
+</code>
-=== type ===
 <code python>
-df['value'] = pd.to_numeric(df['value'], errors='coerce')
+df_chunk = pd.read_csv(r'../input/data.csv', chunksize=1000000) # chink : # of lines
-df['value'].dtype
+for chunk in df_chunk:
+    # perform data filtering
+    chunk_filter = chunk_preprocessing(chunk)
 </code>
+<code python>
+df3=pd.read_csv("abc.txt", skiprows=1, header=None,
+               sep="|", nrows=100, encoding="utf-16") #, error_bad_lines=False)
+with open("abc.txt", encoding="utf-16") as f:
+    i=0
+    for line in f:
+        i=i+1
+        print(line)
+        if i >3 : break
+</code>
+file open/ append lines : https://thispointer.com/how-to-append-text-or-lines-to-a-file-in-python/
 {{tag>software_development python 파이썬 pandas 데이터정리 data_analysis}}