1 rok pred · fa757e5e48
--- a/ch07/02_dataset-utilities/README.md
+++ b/ch07/02_dataset-utilities/README.md
@@ -12,7 +12,7 @@ pip install -r requirements-extra.txt
 
				 
			
 
				 
			
 
				 
			
 
				-### Finding near duplicates
			
 
				+### Finding Near Duplicates
			
 
				 
			
 
				 The `find-near-duplicates.py` script can be used to identify duplicates and near-duplicates in an instruction dataset. For example,
			
 
				 
			
@@ -23,6 +23,9 @@ python find-near-duplicates.py --json_file instruction-examples.json
 
				 ```
			
 
				 
			
 
				 ```
			
 
				+scikit-learn version: 1.3.1
			
 
				+
			
 
				+
			
 
				 ==================================================
			
 
				 Searching 'instruction' for duplicates ...
			
 
				 ==================================================
			
@@ -56,6 +59,22 @@ Duplicate pair found with similarity 1.00:
 
				 
			
 
				 ```
			
 
				 
			
 
				-&nbsp
			
 
				+&nbsp;
			
 
				 You can use the `--threshold` setting with a value between 0 and 1 to decrease or increase the sensitivity.
			
 
				 The default threshold is 0.9.
			
 
				+
			
 
				+
			
 
				+
			
 
				+&nbsp;
			
 
				+ ## Creating Passive Voice Entries
			
 
				+
			
 
				+ - The [create-passive-voice-entries.ipynb](create-passive-voice-entries.ipynb) notebook uses OpenAI's GPT-4 to create "passive voice" entries for an instruction dataset, as shown in the example below:
			
 
				+
			
 
				+ ```python
			
 
				+ {  
			
 
				+    'instruction': 'Identify the verb in the following sentence',
			
 
				+    'input': 'The cat sleeps on the couch.',
			
 
				+    'output': 'The verb in the sentence is "sleeps."',
			
 
				+    'output_2': 'The sentence is "sleeps."'   #  <---- Newly created entry
			
 
				+ }  
			
 
				+ ```
			
--- a/ch07/02_dataset-utilities/find-near-duplicates.py
+++ b/ch07/02_dataset-utilities/find-near-duplicates.py
@@ -61,7 +61,7 @@ def find_near_duplicates(json_data, threshold=0.75, key="instruction"):
 
				     for i in range(len(cos_sim_matrix)):
			
 
				         for j in range(i+1, len(cos_sim_matrix)):
			
 
				             if cos_sim_matrix[i, j] > threshold:
			
 
				-                if len(json_data[i][key]) <= 1 or len(json_data[j][key]) <=1:
			
 
				+                if len(json_data[i][key]) <= 1 or len(json_data[j][key]) <= 1:
			
 
				                     continue
			
 
				                 near_duplicates.append((json_data[i], json_data[j], cos_sim_matrix[i, j]))
			
 
				                 if key in ("input", "output"):  # Don't remove duplicates based on the instruction