In e-commerce applications operating on a marketplace model, a significant challenge is the exchange of phone numbers and email addresses through messages between vendors and customers. This allows them to bypass the platform, resulting in a loss of commission revenue for the site. Users may employ inventive methods to evade standard detection algorithms that rely on regular expressions, such as spelling out numbers, e.g. "fivethreetwo" instead of "532". To develop a comprehensive list of such techniques, you could prompt ChatGPT with: 'I have a webpage with a messaging feature. I want to prevent the inclusion of phone numbers and emails in messages. What are some ways users might try to circumvent my safeguards?'
This issue presents an ideal challenge for AI to address. I experimented with OpenAI's text-davinci-003 model but had little success; it failed on most of my test cases. I then tried the gpt-4 model, which performed much better. I wrote the following PHP script (with help from ChatGPT-4) to tackle the phone number and email detection problem in Turkish text messages. Note that, because AI output is non-deterministic, you should run the script several times with the same input to verify that it produces the expected output every single time. Note also that the message 'şunu dene jane at example 532 222 33' causes the AI to fail about 50% of the time, because it counts the number of digits as 9 when there are actually 8. It also detects an email only some of the time, so there is still some prompt engineering work to do.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// Blog post: https://mentoringcomputer.blogspot.com/2023/11/detecting-phone-number-and-email-in.html

// Your OpenAI API key.
// The OPENAI_API_KEY environment variable takes precedence so the real secret does not
// have to be written into this file; the placeholder is used only when the variable is
// unset or empty.
$envApiKey = getenv('OPENAI_API_KEY');
$apiKey = ($envApiKey !== false && $envApiKey !== '') ? $envApiKey : 'your-api-key';
// Test cases. Each entry pairs a Turkish message ('user_message') with the flags the
// script expects to find in the model's reply ('containsEmail', 'containsPhone').
// The prompt built in the test loop reports a phone number only when the message holds
// strictly more than 8 digits, so the cases with 8 or fewer digits expect
// 'containsPhone' => false.
$user_data_list = [
    // Number written as a single word, too short to be a phone number.
    [
        'user_message' => "şunu dene beşyüzotuz sonra bakarız",
        'containsEmail' => false,
        'containsPhone' => false,
    ],
    // Three literal digits only.
    [
        'user_message' => "şunu dene 530 sonra bakarız",
        'containsEmail' => false,
        'containsPhone' => false,
    ],
    // Phone number spelled out entirely in words.
    [
        'user_message' => "şunu dene beşyüzotuziki ikiyüzyirmiiki otuzüç beş sonra bakarız",
        'containsEmail' => false,
        'containsPhone' => true,
    ],
    // Phone number as space-separated digit groups.
    [
        'user_message' => 'şunu dene 532 222 33 25 sonra bakarız',
        'containsEmail' => false,
        'containsPhone' => true,
    ],
    // Obfuscated email: "(at)" instead of "@".
    [
        'user_message' => 'şunu dene jane(at)example',
        'containsEmail' => true,
        'containsPhone' => false,
    ],
    // Obfuscated email: "(at)" and "(nokta)" instead of "@" and ".".
    [
        'user_message' => 'şunu dene jane (at) example(nokta)com',
        'containsEmail' => true,
        'containsPhone' => false,
    ],
    // Obfuscated email followed by only 8 digits (the trailing space is part of the message).
    [
        'user_message' => 'şunu dene jane(at)example 532 222 33 ',
        'containsEmail' => true,
        'containsPhone' => false,
    ],
    // Same idea with a bare "at"; noted elsewhere in this file as an unstable case.
    [
        'user_message' => 'şunu dene jane at example 532 222 33',
        'containsEmail' => true,
        'containsPhone' => false,
    ],
    // Plain phone number with a Turkish suffix attached.
    [
        'user_message' => "Bana 05322222222'den ulaşabilirsiniz",
        'containsEmail' => false,
        'containsPhone' => true,
    ],
    // Phone number as individual digit words, plus a spelled-out email.
    [
        'user_message' => "İletişim bilgilerim sıfır beş üç iki iki iki iki iki iki iki bir, abc at gmail nokta com",
        'containsEmail' => true,
        'containsPhone' => true,
    ],
    // Mixed digits and digit words with "*" separators, plus a "*"-separated email.
    [
        'user_message' => "İletişim bilgilerim 0 beş 3 iki 2 iki 1 iki * 2 * 1, abc*gmail*nokta*com",
        'containsEmail' => true,
        'containsPhone' => true,
    ],
    // Mixed digits and digit words, plus a "*"-separated email.
    [
        'user_message' => "İletişim bilgilerim 0 beş 3 iki 2 iki 1 iki 2 1, abc*gmail*nokta*com",
        'containsEmail' => true,
        'containsPhone' => true,
    ],
    // Digits separated by "*".
    [
        'user_message' => 'şu: 5*3*2*2*1*2*2*1*0',
        'containsEmail' => false,
        'containsPhone' => true,
    ],
    // Digits separated by "-".
    [
        'user_message' => 'şu: 5-3-2-2-1-2-2-1-0',
        'containsEmail' => false,
        'containsPhone' => true,
    ],
    // NOTE(review): same message as an earlier entry (only the quoting differs) —
    // possibly kept on purpose as a repeat sample; confirm before removing.
    [
        'user_message' => 'İletişim bilgilerim 0 beş 3 iki 2 iki 1 iki 2 1, abc*gmail*nokta*com',
        'containsEmail' => true,
        'containsPhone' => true,
    ],
];
// Known failing test:
/*$user_data_list = [
    [
        'user_message' => 'şunu dene jane at example 532 222 33', // About 50% of the time the AI counts 9 digits, but there are only 8. It also detects an email only some of the time.
        'containsEmail' => true,
        'containsPhone' => false
    ]
];*/
// Marker strings the model is asked to include in its reply; the reply is searched for
// them (case-insensitively) to decide what was detected.
$containsEmailResponse = 'containsEmail';
$containsPhoneResponse = 'containsPhone';

// Overall outcome and the messages of every test that did not pass.
$allTestsPassed = true;
$failedTestMessages = [];

// Run every test case: send the message to the chat completions endpoint, look for the
// marker strings in the reply and compare the result with the expected flags.
foreach ($user_data_list as $index => $user_data) {
    $iTest = $index + 1;
    echo "TEST ".$iTest.", user_message: ".$user_data['user_message'].PHP_EOL;

    // The prompt you want to send to the chat
    $prompt = "In the following Turkish message, count the number of digits. If there are words representing numbers, convert them to digits and count them too. Text: {$user_data['user_message']}. If the number of digits is strictly more than 8, return converted text, number of digits and the text '{$containsPhoneResponse}'. If not, return the text 'no phone number in message'. If the message contains an email, add text '{$containsEmailResponse}' to the respone, if not add 'no email in message'.";

    // Data to be sent to the API
    $data = [
        'model' => 'gpt-4-1106-preview',
        // Lowest sampling temperature: the verdict should vary as little as possible
        // between runs of the same input.
        'temperature' => 0,
        'messages' => [
            ['role' => 'system', 'content' => 'You are a helpful assistant.'],
            ['role' => 'user', 'content' => $prompt],
        ],
    ];

    // Initialize cURL session
    $ch = curl_init('https://api.openai.com/v1/chat/completions');

    // Set cURL options
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_POST, true);
    curl_setopt($ch, CURLOPT_POSTFIELDS, json_encode($data));
    curl_setopt($ch, CURLOPT_HTTPHEADER, [
        'Content-Type: application/json',
        'Authorization: Bearer ' . $apiKey,
    ]);

    // Execute cURL session and get the response
    $response = curl_exec($ch);

    // Capture any transport error before the handle is closed
    $curlError = curl_errno($ch) ? curl_error($ch) : null;

    // Close cURL session
    curl_close($ch);

    // curl_exec() returns false on failure, so only decode an actual response body
    $responseData = is_string($response) ? json_decode($response, true) : null;

    // Handle the response
    if (isset($responseData['choices'][0]['message']['content'])) {
        $content = $responseData['choices'][0]['message']['content'];
        echo "Response from OpenAI: " . $content . PHP_EOL;

        // Parsing the response to extract the information
        $contains_email = stripos($content, $containsEmailResponse) !== false;
        $contains_phone = stripos($content, $containsPhoneResponse) !== false;

        echo "Contains Email: " . ($contains_email ? "Yes" : "No") . ", expected: ".($user_data['containsEmail'] ? "Yes" : "No")."\n";
        echo "Contains Phone Number: " . ($contains_phone ? "Yes" : "No") . ", expected: ".($user_data['containsPhone'] ? "Yes" : "No")."\n";

        echo "Test ".$iTest.": ";
        $testPassed = $contains_email === $user_data['containsEmail'] && $contains_phone === $user_data['containsPhone'];
        $allTestsPassed = $allTestsPassed && $testPassed;
        if ($testPassed) {
            echo "Pass";
        } else {
            echo "FAIL";
            $failedTestMessages[] = $user_data['user_message'];
        }
        echo PHP_EOL;
    } else {
        // No usable reply (transport failure, invalid JSON or an API error payload).
        // The test produced no verdict, so it must count as failed; otherwise the
        // summary could report success although nothing was actually checked.
        if ($curlError !== null) {
            echo 'cURL error: ' . $curlError . PHP_EOL;
        }
        echo "Error: ";
        print_r($responseData);
        echo PHP_EOL;
        echo "Test ".$iTest.": FAIL".PHP_EOL;
        $allTestsPassed = false;
        $failedTestMessages[] = $user_data['user_message'];
    }
    echo "------------------------------------------------------------------\n";
}
// Final report: either confirm that every test passed or list the messages of the
// tests that failed.
echo "SUMMARY:\n";
if ($allTestsPassed) {
    // End the line explicitly; PHP drops the newline that directly follows the closing tag.
    echo "All tests passed".PHP_EOL;
} else {
    echo "Some tests FAILED. Failed messages:\n";
    foreach ($failedTestMessages as $msg) {
        echo $msg.PHP_EOL;
    }
}
// The closing tag is kept because non-PHP text follows it in this file.
?>
No comments:
Post a Comment